From dc91f1d9b509bacc25f1e0b0ae2a241fc5035073 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Fri, 15 Nov 2024 14:43:10 +0800
Subject: [PATCH] =?UTF-8?q?11.15=20=E5=B7=A5=E7=A8=8B=E6=A0=87=E8=B5=84?=
 =?UTF-8?q?=E6=A0=BC=E5=AE=A1=E6=9F=A5=E6=8F=90=E7=A4=BA=E8=AF=8D=E9=87=8D?=
 =?UTF-8?q?=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/format_change.py            |  1 -
 .../general/投标人须知正文提取指定内容.py     |  2 ++
 flask_app/general/无效标和废标公共代码.py     |  1 -
 flask_app/general/读取文件/按页读取pdf.py     |  5 +----
 flask_app/main/基础信息整合快速版.py          |  2 +-
 flask_app/main/工程标解析main.py              |  3 +--
 flask_app/main/形式响应评审.py                |  1 +
 flask_app/main/截取pdf.py                     |  6 +-----
 flask_app/main/截取pdf_old.py                 |  1 -
 ...指定内容.py => 投标人须知正文提取指定内容工程标.py} | 14 +++++++------
 flask_app/main/解析old.py                     |  2 +-
 flask_app/main/资格评审.py                    |  1 -
 flask_app/old_version/基础信息整合.py         |  2 +-
 flask_app/old_version/招标文件解析.py         |  2 +-
 flask_app/货物标/技术参数要求提取.py          | 20 ++++++++++++++++---
 15 files changed, 35 insertions(+), 28 deletions(-)
 rename flask_app/main/{投标人须知正文提取指定内容.py => 投标人须知正文提取指定内容工程标.py} (93%)

diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py
index e6dc960..143d283 100644
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@@ -176,7 +176,6 @@ def docx2pdf(local_path_in):
 #     return output_path
 
 
-#TODO:6.2定版视频会议磋商文件.doc文件转换有问题
 if __name__ == '__main__':
     # 替换为你的文件路径和API URL
     # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py
index 44cb48f..eb9fad1 100644
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@@ -185,6 +185,8 @@ def process_nested_data(data):
 
 #生成无结构的数据货物标
 def concatenate_keys_values(section_content):
+    print("-------------")
+    print(json.dumps(section_content, ensure_ascii=False, indent=4))
     """
     将章节内容的键值对拼接成一个字符串列表，每个元素为 "key value"。
 
diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
index 0f947ac..2bce329 100644
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -574,7 +574,6 @@ def combine_find_invalid(file_path, output_dir):
     print("无效标与废标done...")
     return {"无效标与废标项": combined_dict}
 
-
 # TODO:无效标目前以整个docx文档作为输入，可能导致后面两章不必要的信息也导入。 无效投标至少>8个字
 if __name__ == '__main__':
     start_time = time.time()
diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py
index c1d621d..64d1a86 100644
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -3,8 +3,6 @@ from flask_app.general.clean_pdf import extract_common_header, clean_page_conten
 
 def extract_text_by_page(file_path):
     common_header = extract_common_header(file_path)
-    print(f"公共抬头：{common_header}")
-    print("--------------------正文开始-------------------")
     result = ""
     with open(file_path, 'rb') as file:
         reader = PyPDF2.PdfReader(file)
@@ -14,7 +12,6 @@ def extract_text_by_page(file_path):
             page = reader.pages[page_num]
             text = page.extract_text()
             if text:
-                print("-------------------")
                 cleaned_text = clean_page_content(text,common_header)
                 print(cleaned_text)
                 result += cleaned_text
@@ -96,7 +93,7 @@ def extract_text_by_page(file_path):
 
 if __name__ == '__main__':
     # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path =r"C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf"
+    file_path=r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-11\17F7BF97-1A4D-427D-81F2-5C9AD4B097DB_DF07_Flatten\17F7BF97-1A4D-427D-81F2-5C9AD4B097DB_DF07_Flatten_1-524.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
     # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
diff --git a/flask_app/main/基础信息整合快速版.py b/flask_app/main/基础信息整合快速版.py
index 0bd187e..c85d6a6 100644
--- a/flask_app/main/基础信息整合快速版.py
+++ b/flask_app/main/基础信息整合快速版.py
@@ -4,7 +4,7 @@ import time
 import concurrent.futures
 from flask_app.general.json_utils import clean_json_string, rename_outer_key
 from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 from flask_app.main.判断是否分包等 import read_questions_from_judge, merge_json_to_list
 from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.general.通义千问long import upload_file,qianwen_long
diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py
index b83ef74..3f0009e 100644
--- a/flask_app/main/工程标解析main.py
+++ b/flask_app/main/工程标解析main.py
@@ -12,7 +12,7 @@ from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.general.无效标和废标公共代码 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.main.基础信息整合快速版 import combine_basic_info
 from flask_app.main.资格审查模块 import combine_review_standards
@@ -223,7 +223,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
                 logger.error(f"Error processing {key}: {exc}")
                 yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
 
-#TODO:废标项，针对新文件作优化，统一成货物标的处理逻辑
 #TODO:基本信息，判断是否这里，打勾逻辑取消了。
 #TODO:缩进
 if __name__ == "__main__":
diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py
index a72c78c..2cb444e 100644
--- a/flask_app/main/形式响应评审.py
+++ b/flask_app/main/形式响应评审.py
@@ -317,6 +317,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus
             file_id = upload_file(output_path)
             results = multi_threading(formatted_questions, "", file_id, 2)
             first_response_list = [clean_json_string(res) for _, res in results] if results else []
+            print(first_response_list)
 
     updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
     return updated_json
diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py
index d8c55d1..9851a31 100644
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@@ -580,18 +580,14 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
         logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
         return [""] * len(selections)  # 返回与 selections 数量相同的空字符串列表
 
-
-# TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 # TODO:目前merged_baseinfo没有包含投标人须知正文。
 
-#TODO:zbtest20有问题
-# 投标人须知前附表改为货物标一样的
 if __name__ == "__main__":
     start_time = time.time()
     # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
     # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
     # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4.pdf"
+    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
     output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
     files=truncate_pdf_multiple(input_path,output_folder)
     # selections = [4, 1]  # 仅处理 selection 4、1
diff --git a/flask_app/main/截取pdf_old.py b/flask_app/main/截取pdf_old.py
index 0b9861f..51280ee 100644
--- a/flask_app/main/截取pdf_old.py
+++ b/flask_app/main/截取pdf_old.py
@@ -515,7 +515,6 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
         return [""] * len(selections)  # 返回与 selections 数量相同的空字符串列表
 
 
-# TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 #TODO:zbtest8 zbtest18有问题  后期需要完善，截取需要截两次，第一次严格第二次宽松
 if __name__ == "__main__":
     input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest8.pdf"
diff --git a/flask_app/main/投标人须知正文提取指定内容.py b/flask_app/main/投标人须知正文提取指定内容工程标.py
similarity index 93%
rename from flask_app/main/投标人须知正文提取指定内容.py
rename to flask_app/main/投标人须知正文提取指定内容工程标.py
index 3359b01..cfba78f 100644
--- a/flask_app/main/投标人须知正文提取指定内容.py
+++ b/flask_app/main/投标人须知正文提取指定内容工程标.py
@@ -115,12 +115,14 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
             "Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")
     with open(clause_path, 'r', encoding='utf-8') as file:
         data = json.load(file)
-        extracted_data = extract_between_sections(data, target_values)
+        extracted_data = extract_between_sections(data, target_values) # First try filtering by top-level chapter headings such as '二、投标文件'
         if not extracted_data:
-            extracted_data = extract_json(data, target_values)  # 提取需要的数据
+            extracted_data = extract_json(data, target_values)  # If that finds nothing, filter by numbered clause headings such as '3.投标文件'
             if not extracted_data:
-                final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  # 万一没用正则匹配到，那就调用大模型
+                final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  # If both lookups find nothing, fall back to calling the large language model
                 return final_result
+            print("老方法")
+            print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
             final_result=extract_sections(extracted_data,target_values)  #后处理，生成键名
             return final_result
         else:
@@ -138,10 +140,10 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
 
 if __name__ == "__main__":
     # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\merged_baseinfo_path_more.pdf"
-    clause_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\tmp\clause1.json"
+    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\merged_baseinfo_path_more.pdf"
+    clause_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\clause1.json"
     try:
-        res = extract_from_notice(merged_baseinfo_path,clause_path, 3)  # 可以改变此处的 type 参数测试不同的场景
+        res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
         res2 = json.dumps(res, ensure_ascii=False, indent=4)
         print(res2)
     except ValueError as e:
diff --git a/flask_app/main/解析old.py b/flask_app/main/解析old.py
index 3ba3b10..7e545e2 100644
--- a/flask_app/main/解析old.py
+++ b/flask_app/main/解析old.py
@@ -8,7 +8,7 @@ from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.main.基础信息整合快速版 import combine_basic_info
 from flask_app.main.资格审查模块 import combine_review_standards
diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py
index 1ba3469..daaf3ac 100644
--- a/flask_app/main/资格评审.py
+++ b/flask_app/main/资格评审.py
@@ -134,7 +134,6 @@ def get_consortium_dict(merged_baseinfo_path):
         consortium_dict = clean_json_string(results1)
     return consortium_dict
 
-#TODO：修改问题
 def get_all_dict(invalid_path, ques=None):
     if ques is None:
         ques = []
diff --git a/flask_app/old_version/基础信息整合.py b/flask_app/old_version/基础信息整合.py
index c7514fc..e8be244 100644
--- a/flask_app/old_version/基础信息整合.py
+++ b/flask_app/old_version/基础信息整合.py
@@ -1,7 +1,7 @@
 import json
 
 from flask_app.general.json_utils import clean_json_string, rename_outer_key
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
 from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.general.通义千问long import upload_file
diff --git a/flask_app/old_version/招标文件解析.py b/flask_app/old_version/招标文件解析.py
index 5b1eee0..8779306 100644
--- a/flask_app/old_version/招标文件解析.py
+++ b/flask_app/old_version/招标文件解析.py
@@ -9,7 +9,7 @@ from flask_app.old_version.文档理解大模型版知识库处理.知识库操
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.old_version.基础信息整合 import combine_basic_info
 from flask_app.old_version.资格审查模块old import combine_review_standards
diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py
index b921d02..050fc20 100644
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -167,7 +167,7 @@ def get_technical_requirements(file_id,invalid_path):
         file_id=upload_file(invalid_path)
         print("调用invalid_path")
     user_query1 = """
-请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
+请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统和货物，采购需求中可能包含层次关系，例如采购的某系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
 {
     "采购需求": {
         "交换机-1"：{},
@@ -181,6 +181,20 @@ def get_technical_requirements(file_id,invalid_path):
     }
 }
     """
+#     user_query1 = """
+#     请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我该项目需要采购的系统或货物，要求回答全面不要遗漏，最细提取到具体的货物名称，而不是货物的功能需求。如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统和货物。请以json格式返回结果，外层键名为采购的系统或货物名称，注意：采购需求中可能包含层次关系，例如采购的某系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，此时内层键名为该系统所需的货物名，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如果采购的某系统说明了该系统整体功能，那么在其内层键名中除了有该系统包含的货物，还应包含'系统功能'，具体键名同原文中的描述；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
+#     {
+#         "交换机-1"：{},
+#         "交换机-2":{},
+#         "门禁管理系统": {},
+#         "交通监控视频子系统": {
+#             "系统功能":{}
+#             "高清视频抓拍像机":{},
+#             "补光灯":{}
+#         },
+#         "LED全彩显示屏": {}
+#     }
+#         """
     res = qianwen_long(file_id, user_query1)
     print(res)
     cleaned_res = clean_json_string(res)     #转字典
@@ -271,8 +285,8 @@ def test_all_files_in_folder(input_folder, output_folder):
                 print(f"处理文件 {file_path} 时出错: {e}")
 
 if __name__ == "__main__":
-    truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
-    # # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
+    # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
+    truncate_file=r"C:\Users\Administrator\Desktop\货物标\output1\招标文件(107国道)_procurement.docx"
     # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
     # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
     # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"