From 04b28a6028c61d25f595b3927594d76d228fb27e Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Fri, 6 Sep 2024 17:00:35 +0800
Subject: [PATCH] =?UTF-8?q?9.6=E9=83=A8=E5=88=86=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E5=BC=95=E7=94=A8=E4=BA=86=E6=8B=9B=E6=A0=87=E5=85=AC=E5=91=8A?=
 =?UTF-8?q?=E4=B8=AD=E7=9A=84=E5=86=85=E5=AE=B9=EF=BC=8C=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E4=BA=86=E5=AF=B9=E4=BA=8C=E6=AC=A1=E8=B7=B3=E8=BD=AC=E7=9A=84?=
 =?UTF-8?q?=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/main/table_content_extraction.py    | 27 +++++++++++++------
 flask_app/main/基础信息整合.py                |  9 +++----
 flask_app/main/形式响应评审.py                |  5 ++--
 flask_app/main/截取pdf.py                     |  6 ++---
 flask_app/main/投标人须知正文提取指定内容.py  |  4 +--
 .../main/投标人须知正文条款提取成json文件.py  |  3 +++
 flask_app/main/招标文件解析.py                | 13 ++++-----
 flask_app/main/资格评审.py                    |  4 +--
 8 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py
index a678448..6dcfb6b 100644
--- a/flask_app/main/table_content_extraction.py
+++ b/flask_app/main/table_content_extraction.py
@@ -1,3 +1,5 @@
+import os
+
 from docx import Document
 import json
 
@@ -57,6 +59,7 @@ def read_tables_from_docx(file_path):
 
     return table_list
 
+
 def flatten_nested_dicts(d):
     """平坦化嵌套字典，以便更简洁地保存为JSON."""
     keys_to_remove = []
@@ -77,12 +80,21 @@ def flatten_nested_dicts(d):
 
     return d
 
-def save_data_to_json(data, filename):
-    """将数据保存到JSON文件中."""
-    with open(filename, 'w', encoding='utf-8') as file:
-        json.dump(data, file, ensure_ascii=False, indent=4)
 
-def extract_tables_main(path, output_filename):
+def save_data_to_json(data, output_folder):
+    """Dump ``data`` as JSON to ``output_folder``/truncate_output.json and return that path."""
+    output_filepath = os.path.join(output_folder, "truncate_output.json")
+    # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
+    with open(output_filepath, 'w', encoding='utf-8') as file:
+        json.dump(data, file, ensure_ascii=False, indent=4)
+    print(f"The data has been processed and saved to '{output_filepath}'.")
+    return output_filepath
+
+
+def extract_tables_main(path, output_folder):
+    if not os.path.exists(path):
+        print(f"The specified file does not exist: {path}")
+        return ""
     # 读取文档表格数据
     table_data = read_tables_from_docx(path)
 
@@ -90,11 +102,10 @@ def extract_tables_main(path, output_filename):
     flattened_data = flatten_nested_dicts(table_data)
 
     # 保存平坦化后的数据到JSON文件
-    save_data_to_json(flattened_data, output_filename)
+    return save_data_to_json(flattened_data, output_folder)
 
-    print(f"The data has been processed and saved to '{output_filename}'.")
 
 if __name__ == "__main__":
-    path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\zbtest20_17-22.docx'
-    output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\truncate_output.json"  # 前附表json文件
-    extract_tables_main(path, output_filename)
+    path = ''
+    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20"  # folder that receives truncate_output.json
+    extract_tables_main(path, output_folder)
diff --git a/flask_app/main/基础信息整合.py b/flask_app/main/基础信息整合.py
index d15e48e..48ae62b 100644
--- a/flask_app/main/基础信息整合.py
+++ b/flask_app/main/基础信息整合.py
@@ -81,8 +81,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):    #
     # 调用大模型回答项目基础信息
     print("starting基础信息...")
     baseinfo_list = []
-    baseinfo_file_path='../static/提示词/前两章提问总结.txt'
-    # baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
+    # baseinfo_file_path='../static/提示词/前两章提问总结.txt'
+    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
     questions = read_questions_from_file(baseinfo_file_path)
     res1 = multi_threading(questions, knowledge_name)
     for _, response in res1:  # _占位，代表ques;response[0]也是ques;response[1]是ans
@@ -97,11 +97,10 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):    #
     # 判断是否分包、是否需要递交投标保证金等
     chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
     baseinfo_list.append(merged)
-    judge_file_path = '../static/提示词/是否相关问题.txt'
-    # judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
+    # judge_file_path = '../static/提示词/是否相关问题.txt'
+    judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
     judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
 
-
     judge_consortium = judge_consortium_bidding(baseinfo_list)     #通过招标公告判断是否接受联合体投标
     if judge_consortium:
         judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的，请按json格式给我提供信息，外层键名为'联合体投标要求'，其中有一个嵌套键值对为：\"是否接受联合体投标\":\"是\""
diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py
index bcf87ba..27911b2 100644
--- a/flask_app/main/形式响应评审.py
+++ b/flask_app/main/形式响应评审.py
@@ -125,7 +125,6 @@ def extract_matching_keys(json_data):
 
     return final_matching
 
-#TODO:如果要引用到招标公告中的内容，考虑提取  或者qianwen-long
 def reformat_questions(match_keys, input_path, output_folder):
     entries_with_numbers = []
     entries_with_numbers2 = []  # 用于保存特别处理的条目
@@ -176,11 +175,11 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
     entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
     combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
                                                  clause_json_path)  # 调用根据条款号整合json.py
-    combined_results1, formatted_questions2 = judge_second_jump(combined_results)
+    combined_results1, formatted_questions2 = judge_second_jump(combined_results)    #判断是否需要二次跳转
     if entries_with_numbers2:                           #跳转第一章招标公告
         combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
 
-    results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name, True)    #无序号的直接问大模型
+    results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name)    #无序号的直接问大模型
     first_response_list = []
     for _, response in results_2:
         try:
diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py
index d8868a0..ae63796 100644
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@@ -152,7 +152,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
     # 确保找到了起始和结束页面
     if start_page is None or end_page is None:
         if output_suffix == "qualification" or output_suffix =="invalid":
-            extract_pages_twice(pdf_path, output_folder, output_suffix)
+            return extract_pages_twice(pdf_path, output_folder, output_suffix)
         else:
             print(f"未找到起始或结束页在文件 {pdf_path} 中！")
             return ""
@@ -249,9 +249,9 @@ def truncate_pdf_multiple(input_path, output_folder):
 
 # TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest18.pdf"
+    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest20.pdf"
     output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
     # truncate_pdf_multiple(input_path,output_folder)
-    selection = 5  # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
+    selection = 4  # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
     generated_files = truncate_pdf_main(input_path, output_folder, selection)
     # # print("生成的文件:", generated_files)
diff --git a/flask_app/main/投标人须知正文提取指定内容.py b/flask_app/main/投标人须知正文提取指定内容.py
index c65c799..20c620c 100644
--- a/flask_app/main/投标人须知正文提取指定内容.py
+++ b/flask_app/main/投标人须知正文提取指定内容.py
@@ -94,7 +94,7 @@ def transform_json(data):
 
 
 # 读取JSON数据，提取内容，转换结构，并打印结果
-def extract_from_notice(file_path, type):
+def extract_from_notice(clause_path, type):
     if type == 1:
         target_values = ["投标文件", "投标"]
     elif type == 2:
@@ -103,7 +103,7 @@ def extract_from_notice(file_path, type):
         target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
     else:
         raise ValueError("Invalid type specified. Use 1 for '开标, 评标, 定标' or 2 for '投标文件, 投标'.")
-    with open(file_path, 'r', encoding='utf-8') as file:
+    with open(clause_path, 'r', encoding='utf-8') as file:
         data = json.load(file)
         extracted_data = extract_json(data, target_values)  # 读取json
         transformed_data = transform_json(extracted_data)
diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py
index 70e141a..917bd55 100644
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@@ -125,6 +125,9 @@ def convert_to_json(file_path, start_word, end_phrases):
     return parsed_data
 
 def convert_clause_to_json(input_path,output_folder,type=1):
+    if not os.path.exists(input_path):
+        print(f"The specified file does not exist: {input_path}")
+        return ""
     if type==1:
         start_word = "投标人须知正文"
         end_phrases = [
diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py
index a101b5e..e145a2d 100644
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@@ -43,15 +43,15 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
     index = addfileToKnowledge(docx_path, knowledge_name)
 
     # 调用截取PDF多次
-    truncate_files = truncate_pdf_multiple(pdf_path, output_folder)  # [前附表, 评标办法, 须知正文, 资格审查条件，无效标]
+    truncate_files = truncate_pdf_multiple(pdf_path, output_folder)  # [前附表, 评标办法, 须知正文, 资格审查条件]
+    print(truncate_files)
 
     # 处理各个部分
     truncate0_docpath = pdf2docx(truncate_files[0])  # 投标人须知前附表转docx
 
     invalid_docpath = copy_docx(docx_path)     #docx截取无效标部分
 
-    truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
-    extract_tables_main(truncate0_docpath, truncate0_jsonpath)  # 投标人须知前附表docx->json,从表格提取数据
+    truncate_jsonpath=extract_tables_main(truncate0_docpath, output_folder)  # 投标人须知前附表docx->json,从表格提取数据
     truncate0 = truncate_files[0]     #投标人须知前附表
     truncate1 = truncate_files[1]     #评标办法前附表
     truncate3 = truncate_files[3]     #资格审查表
@@ -65,7 +65,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
         'truncate3': truncate3,
         'knowledge_index': index,
         'knowledge_name': knowledge_name,
-        'truncate0_jsonpath': truncate0_jsonpath,
+        'truncate0_jsonpath': truncate_jsonpath,
         'clause_path': clause_path,
         'invalid_docpath': invalid_docpath
     }
@@ -215,7 +215,6 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
 #
 #     deleteKnowledge(processed_data['knowledge_index'])
 
-# TODO:如果上传的是pdf转过的docx文件，那么提取打勾符号就会有问题  zbtest20   跳转涉及二级跳转   对于跳转到第一章 招标公告的要做额外处理     资格审查位置在第一章后面。如果未截取成功，需要作额外处理      logger不能保存控制台输出
 if __name__ == "__main__":
     output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3"
 
@@ -227,7 +226,9 @@ if __name__ == "__main__":
     start_time = time.time()
     file_type = 1        #1:docx 2:pdf 3:其他
     input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3\\zbtest20.docx"
-    file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
+    # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
+
+    preprocess_files(output_folder, input_file, file_type, "unique_id")
     end_time = time.time()
     elapsed_time = end_time - start_time  # 计算耗时
     print(f"Function execution took {elapsed_time} seconds.")
diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py
index 1d94dff..9ceceac 100644
--- a/flask_app/main/资格评审.py
+++ b/flask_app/main/资格评审.py
@@ -78,8 +78,8 @@ def get_consortium_dict(knowledge_name):
     return consortium_dict
 
 def get_all_dict(knowledge_name):
-    qualification_review_file_path = '../static/提示词/资格评审.txt'  # 替换为你的txt文件路径
-    # # qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
+    # qualification_review_file_path = '../static/提示词/资格评审.txt'  # 替换为你的txt文件路径
+    qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
     questions = read_questions_from_file(qualification_review_file_path)
     qualification_list = []
     res1 = multi_threading(questions, knowledge_name)