From ca9e995d2ea1fde742ffb7b8466bb9ca4e71d497 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Sat, 16 Nov 2024 16:14:53 +0800
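Subject: [PATCH] 11.16: drop procurement_docpath and disable per-item technical queries

- Remove the procurement_docpath parameter from combine_basic_info,
  fetch_procurement_reqs and fetch_project_basic_info, and stop calling
  pdf2docx on the procurement PDF in preprocess_files.
- doubao_model: lower temperature from 0.5 to 0.3 and print a trace line
  on every call.
- get_technical_requirements: add a prompt rule limiting extraction to
  software/hardware (construction-related requirements are skipped) and
  print the model response in the invalid_path branch.
- get_technical_requirements: comment out the post-processing stage
  (key-path generation, per-item technical-parameter queries, result
  merging). NOTE: the final return statement is commented out as well,
  so the function currently returns None after printing the response.
- Update the TODO list in 货物标解析main.py.
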
---
 flask_app/general/doubao.py          |   3 +-
 flask_app/货物标/基础信息解析main.py |   4 +-
 flask_app/货物标/技术参数要求提取.py | 130 ++++++++++++++-------------
 flask_app/货物标/提取采购需求main.py |   7 +-
 flask_app/货物标/货物标解析main.py   |  13 +--
 5 files changed, 77 insertions(+), 80 deletions(-)

diff --git a/flask_app/general/doubao.py b/flask_app/general/doubao.py
index 685c051..f3581db 100644
--- a/flask_app/general/doubao.py
+++ b/flask_app/general/doubao.py
@@ -46,6 +46,7 @@ def read_txt_to_string(file_path):
     except Exception as e:
         return f"错误：读取文件时发生错误。详细信息：{e}"
 def doubao_model(full_user_query):
+    print("call doubao...")
     # 相关参数
     url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
     api_key = "ad0c363f-1f23-4b13-aba3-698a4f8c3eb8"
@@ -64,7 +65,7 @@ def doubao_model(full_user_query):
                 "content": full_user_query
             }
         ],
-        "temperature":0.5
+        "temperature":0.3
     }
     response = requests.post(url, headers=headers, json=data)
 
diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py
index 1fe9ed2..2f039fd 100644
--- a/flask_app/货物标/基础信息解析main.py
+++ b/flask_app/货物标/基础信息解析main.py
@@ -176,7 +176,7 @@ def get_base_info(merged_baseinfo_path,clause_path):
     #         baseinfo_list.append(clean_json_string(response))
     return baseinfo_list
 
-def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path,invalid_path):
+def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invalid_path):
     baseinfo_list = []
     temp_list = []
     procurement_reqs = {}
@@ -187,7 +187,7 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpat
     # 定义一个线程函数来获取采购需求
     def fetch_procurement_reqs_thread():
         nonlocal procurement_reqs
-        procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path)
+        procurement_reqs = fetch_procurement_reqs(procurement_path,invalid_path)
     # 创建并启动获取基础信息的线程
     thread1 = threading.Thread(target=get_base_info_thread)
     thread1.start()
diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py
index 5eebc00..a087c5d 100644
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -202,7 +202,8 @@ def get_technical_requirements(file_path,invalid_path):
                     6.最后一级键内值留空或填'未知'（如数量较多或未知内容）。
 
                     特殊情况处理：
-                        同一层级下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
+                        1.同一层级下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
+                        2.对于工程施工、建设相关的采购需求，你无需提取，提取的范围仅限软件、硬件，。
 
                     示例输出结构：
                     {{
@@ -246,7 +247,7 @@ def get_technical_requirements(file_path,invalid_path):
                         6.最后一级键内值留空或填'未知'（如数量较多或未知内容）。
 
                         特殊情况处理：
-                            同一层级下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
+                        同一层级下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
 
                         示例输出结构：
                         {{
@@ -276,73 +277,74 @@ def get_technical_requirements(file_path,invalid_path):
         file_id=upload_file(invalid_path)
         print("调用invalid_path")
         model_res=qianwen_long(file_id,prompt_template1)
+        print(model_res)
     else:
         user_query=generate_full_user_query(file_path,prompt_template2)
         model_res=doubao_model(user_query)
+        # model_res = qianwen_long(file_id,prompt_template1)
         print(model_res)
-    # res = qianwen_long(file_id, user_query1)
 
-    cleaned_res = clean_json_string(model_res)     #转字典
-    keys_list,good_list,grouped_paths,no_keys_added= generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单 key_list：交通监控视频子系统.高清视频抓拍像机 ...
-    if no_keys_added:
-        final_res = postprocess(cleaned_res)
-    else:
-        # user_query_template = "请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
-        user_query_template = """
-请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，键名为\"{}\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串，需与原文完全一致，即若技术参数前存在序号也要保留，但你不可擅自增添或删减。以下为需要考虑的特殊情况：如果该货物没有相关采购要求或技术参数要求，键值为空列表。示例输出格式如下：
-{{
-    "摄像机控制键盘": [
-        "1、支持串行 RS232/RS422 和 IP 混合控制，允许在一个控制器上使用 RS232/RS422/IP 控制单个系统中的摄像机；",
-        "2、支持 2 组 RS422 串口 VISCA 协议菊花链控制 2x7 台摄像机。"
-    ]
-}}
-            """
-        user_query_template_two="""
-请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），由于该货物存在多种不同的采购要求或技术参数，请你请逐一列出，请以json格式返回结果，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增，即第一个键名为\"{}-1\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串，需与原文完全一致，即若技术参数前存在序号也要保留，但你不可擅自增添或删减。示例输出格式如下：
-{{
-    "交换机-1": [
-        "1、支持固化千兆电口≥8 个，固化千兆光口≥2 个，桌面型设备;",
-        "2、支持静态链路聚合"
-    ]
-    "交换机-2":[
-        "1、交换容量≥52Gbps，包转发率≥38.69Mpps，",
-        "2、提供国家强制性产品认证证书及测试报告（3C）"
-    ]
-}}
-        """
-        queries = []
-        for key in keys_list:
-            # 将键中的 '.' 替换为 '下的'
-            modified_key = key.replace('.', '下的')
-            # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
-            new_query = user_query_template.format(modified_key, key, modified_key)
-            queries.append(new_query)
-
-            # 处理 grouped_paths 中的项，应用 user_query_template_two
-        for grouped_key in grouped_paths:
-            # 将键中的 '.' 替换为 '下的'
-            modified_grouped_key = grouped_key.replace('.', '下的')
-            # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
-            new_query = user_query_template_two.format(modified_grouped_key, grouped_key, modified_grouped_key)
-            queries.append(new_query)
-        results = multi_threading(queries, "", file_id, 2)
-        technical_requirements = []
-        if not results:
-            print("errror!未获得大模型的回答！")
-        else:
-            # 打印结果
-            for question, response in results:
-                technical_requirements.append(response)
-        technical_requirements_combined_res = combine_json_results(technical_requirements)
-
-        """根据所有键是否已添加处理技术要求"""
-        # 更新原始采购需求字典
-        final_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
-        # final_res = postprocess(cleaned_res)
-        final_res["货物列表"] = good_list
-
-    # 输出最终的 JSON 字符串
-    return {"采购需求": final_res}
+#     cleaned_res = clean_json_string(model_res)     #转字典
+#     keys_list,good_list,grouped_paths,no_keys_added= generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单 key_list：交通监控视频子系统.高清视频抓拍像机 ...
+#     if no_keys_added:
+#         final_res = postprocess(cleaned_res)
+#     else:
+#         # user_query_template = "请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，外层键名为\"{}\", 键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减。"
+#         user_query_template = """
+# 请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），请以json格式返回结果，键名为\"{}\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串，需与原文完全一致，即若技术参数前存在序号也要保留，但你不可擅自增添或删减。以下为需要考虑的特殊情况：如果该货物没有相关采购要求或技术参数要求，键值为空列表。示例输出格式如下：
+# {{
+#     "摄像机控制键盘": [
+#         "1、支持串行 RS232/RS422 和 IP 混合控制，允许在一个控制器上使用 RS232/RS422/IP 控制单个系统中的摄像机；",
+#         "2、支持 2 组 RS422 串口 VISCA 协议菊花链控制 2x7 台摄像机。"
+#     ]
+# }}
+#             """
+#         user_query_template_two="""
+# 请你根据该货物标中采购要求部分的内容，请你给出\"{}\"的技术参数（或采购要求），由于该货物存在多种不同的采购要求或技术参数，请你请逐一列出，请以json格式返回结果，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增，即第一个键名为\"{}-1\", 键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串，需与原文完全一致，即若技术参数前存在序号也要保留，但你不可擅自增添或删减。示例输出格式如下：
+# {{
+#     "交换机-1": [
+#         "1、支持固化千兆电口≥8 个，固化千兆光口≥2 个，桌面型设备;",
+#         "2、支持静态链路聚合"
+#     ]
+#     "交换机-2":[
+#         "1、交换容量≥52Gbps，包转发率≥38.69Mpps，",
+#         "2、提供国家强制性产品认证证书及测试报告（3C）"
+#     ]
+# }}
+#         """
+#         queries = []
+#         for key in keys_list:
+#             # 将键中的 '.' 替换为 '下的'
+#             modified_key = key.replace('.', '下的')
+#             # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
+#             new_query = user_query_template.format(modified_key, key, modified_key)
+#             queries.append(new_query)
+#
+#             # 处理 grouped_paths 中的项，应用 user_query_template_two
+#         for grouped_key in grouped_paths:
+#             # 将键中的 '.' 替换为 '下的'
+#             modified_grouped_key = grouped_key.replace('.', '下的')
+#             # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
+#             new_query = user_query_template_two.format(modified_grouped_key, grouped_key, modified_grouped_key)
+#             queries.append(new_query)
+#         results = multi_threading(queries, "", file_id, 2)
+#         technical_requirements = []
+#         if not results:
+#             print("error!未获得大模型的回答！")
+#         else:
+#             # 打印结果
+#             for question, response in results:
+#                 technical_requirements.append(response)
+#         technical_requirements_combined_res = combine_json_results(technical_requirements)
+#
+#         """根据所有键是否已添加处理技术要求"""
+#         # 更新原始采购需求字典
+#         final_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
+#         # final_res = postprocess(cleaned_res)
+#         final_res["货物列表"] = good_list
+#
+#     # 输出最终的 JSON 字符串
+#     return {"采购需求": final_res}
 
 def test_all_files_in_folder(input_folder, output_folder):
     # 确保输出文件夹存在
diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py
index 4b639cd..b907264 100644
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@@ -7,7 +7,8 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ
 
 
 #获取采购清单
-def fetch_procurement_reqs(procurement_path, procurement_docpath, invalid_path):
+def fetch_procurement_reqs(procurement_path, invalid_path):
+    # procurement_docpath = pdf2docx(procurement_path)  # 采购需求docx
     # 定义默认的 procurement_reqs 字典
     DEFAULT_PROCUREMENT_REQS = {
         "采购需求": "",
@@ -22,8 +23,6 @@ def fetch_procurement_reqs(procurement_path, procurement_docpath, invalid_path):
         return DEFAULT_PROCUREMENT_REQS.copy()
 
     try:
-        # 上传文件并获取 file_id
-        # file_id = upload_file(procurement_docpath)
 
         # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
         with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -63,7 +62,7 @@ if __name__ == "__main__":
     procurement_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile_procurement.pdf"
     procurement_docpath="C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile_procurement.docx"
     invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile.pdf"
-    res=fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path)
+    res=fetch_procurement_reqs(procurement_path,invalid_path)
     print(json.dumps(res, ensure_ascii=False, indent=4))
     end_time=time.time()
     print("耗时:"+str(end_time-start_time))
diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py
index 4fe5821..245661e 100644
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@@ -54,7 +54,6 @@ def preprocess_files(output_folder, file_path, file_type,logger):
     invalid_path=pdf_path
     invalid_docpath = docx_path  # docx截取无效标部分
     procurement_path = truncate_files[5]  # 采购需求
-    procurement_docpath=pdf2docx(procurement_path) # 采购需求docx
     evaluation_method_path = truncate_files[1]  # 评标办法
     qualification_path = truncate_files[2]  # 资格审查
     tobidders_notice_path = truncate_files[4]  # 投标人须知正文
@@ -70,7 +69,6 @@ def preprocess_files(output_folder, file_path, file_type,logger):
         'invalid_path': invalid_path,
         'output_folder': output_folder,
         'procurement_path': procurement_path,
-        'procurement_docpath':procurement_docpath,
         'evaluation_method_path': evaluation_method_path,
         'qualification_path': qualification_path,
         'notice_path': notice_path,
@@ -80,16 +78,14 @@ def preprocess_files(output_folder, file_path, file_type,logger):
         'merged_baseinfo_path': merged_baseinfo_path
     }
 
-def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, procurement_path,procurement_docpath, clause_path,logger):
+def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, procurement_path, clause_path,logger):
     logger.info("starting 基础信息...")
     start_time = time.time()
     if not merged_baseinfo_path:
         merged_baseinfo_path = invalid_path
     if not procurement_path:
         procurement_path=invalid_path
-    if not procurement_docpath:
-        procurement_docpath=invalid_docpath
-    basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path,invalid_path)
+    basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invalid_path)
     base_info, good_list = post_process_baseinfo(basic_res,logger)
     end_time = time.time()
     logger.info(f"基础信息 done，耗时：{end_time - start_time:.2f} 秒")
@@ -207,7 +203,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
                                                               processed_data['clause_path'],logger),
             'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path'],logger),
             'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
-                                         processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path'],logger),
+                                         processed_data['procurement_path'],processed_data['clause_path'],logger),
             'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],
                                                     processed_data['qualification_path'],
                                                     processed_data['notice_path'],logger),
@@ -242,8 +238,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 
 
 #TODO:把所有未知都删掉。
-#TODO:流式输出
-#TODO:医院优化提示词
+#TODO:考虑把解析失败的调用豆包，全文上传。
 #商务标这里改为列表最里层
 #good_list 金额  截取上下文
 if __name__ == "__main__":