From a94877ddb7fdfcf19deb0e20dd138e9a492f5762 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 23 Dec 2024 17:29:53 +0800
Subject: [PATCH] =?UTF-8?q?12.23=20=E6=97=A0=E6=95=88=E6=A0=87=E5=BA=9F?=
 =?UTF-8?q?=E6=A0=87=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/无效标和废标公共代码.py | 42 +++++++++++++++--------
 flask_app/test_case/test_正则表达式.py    |  4 +--
 flask_app/货物标/商务服务其他要求提取.py  |  9 ++---
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
index 807640b..8541dbd 100644
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -265,7 +265,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
                         next_text = processed_paragraphs[current_index].strip()
                         if not found_next_number:
                             # 修改后的正则，支持 '数字 、' 格式
-                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)|([（(]\d+[）)])|(\d+\s*、)',
+                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)|([（(]\s*\d+\s*[）)])|(\d+\s*、)',
                                                            next_text)
                             if next_section_number:
                                 found_next_number = True
@@ -274,7 +274,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
                                     dynamic_pattern = r'^' + r'[.．]'.join(
                                         [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                 elif next_section_number.group(2):
-                                    dynamic_pattern = r'^[\(\（]\d+[\)\）]'
+                                    dynamic_pattern = r'^[\(\（]\s*\d+\s*[\)\）]'
                                 elif next_section_number.group(3):
                                     dynamic_pattern = r'^\d+\s*、'
                                 current_section_pattern = re.compile(dynamic_pattern)
@@ -452,13 +452,13 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords):
 
                     # 清洗文本，去除前缀编号等
 
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z][.．]\s*|[A-Za-z]?\d+\s*([.．]\s*\d+)*(\s|[.．]|、|．)?|[一二三四五六七八九十]+、)'
+                    pattern = r'^\s*(?:[（(]\s*\d+\s*[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）\.、．]?\s*)'
                     full_text = re.sub(pattern, '', full_text).replace(' ', '').strip()
                     sentences2.append(full_text)  # 存储有后续关键词的情况
                     i = end_index if found_next_section else len(split_sentences)
                 else:
                     # 没有后续关键词的情况
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z][.．]\s*|[A-Za-z]?\d+\s*([.．]\s*\d+)*(\s|[.．]|、|．)?|[一二三四五六七八九十]+、)'
+                    pattern = r'^\s*(?:[（(]\s*\d+\s*[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）\.、．]?\s*)'
                     cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ', '').strip()
                     if len(cleaned_sentence) > 8:
                         sentences1.append(cleaned_sentence)  # 存储没有后续关键词的情况
@@ -510,9 +510,11 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
             r'情\s*况\s*之\s*一',
             r'下\s*列',
             r'以\s*下',
-            r'其\s*他.*?情\s*形\s*[:：]'
+            r'其\s*他.*?情\s*形\s*[:：]',
+            r'包\s*括'
         ]
         extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # 字典结果
+
         all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # 列表
 
         # table_data_list=read_docx_last_column(file_path)   #从投标人须知前附表中提取信息生成列表data，每个元素为'一行信息'
@@ -522,6 +524,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
         # Proceed only if there is content to write
         selected_contents = set()  # 使用 set 去重
         if qianwen_txt:
+            print(qianwen_txt)
             with open(output_file, 'w', encoding='utf-8') as file:
                 counter = 1
                 for content in qianwen_txt:
@@ -532,7 +535,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
             model_ans=doubao_model(user_query)   #豆包
             # file_id = upload_file(output_file)
             # model_ans = qianwen_long(file_id, user_query)
-            # model_ans = qianwen_long_text(file_id, user_query)
             num_list = process_string_list(model_ans)
             print(result_key + "选中的序号:" + str(num_list))
 
@@ -577,16 +579,28 @@ def combine_find_invalid(invalid_docpath, output_dir):
             r'视\s*为\s*无\s*效|'
             r'被\s*拒\s*绝|'
             r'予\s*以\s*拒\s*绝',
-            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：否决投标或拒绝投标或无效投标或投标失效的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。
-            文本内容：{full_text}
+            """以下是从招标文件中摘取的内容，文本内信息之间以...............分割。
+请根据该内容回答以下问题：
+否决投标、拒绝投标、无效投标或投标失效的情况有哪些？
+要求与指南：
+    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
+    若存在语义相同或重复的情况，只需返回其中一个序号。
+    请以[x, x, x]格式返回结果，其中x为符合的信息的序号；若没有符合的情况，请返回[]。
+文本内容：{full_text}
             """,
             os.path.join(output_dir, "temp1.txt"),
             "否决和无效投标情形"
         ),
         (
             r'废\s*标',
-            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：废标项的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。
-            文本内容：{full_text}
+            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割。
+请根据该内容回答以下问题：
+废标项的情况有哪些？
+要求与指南：
+    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
+    若存在语义相同或重复的情况，只需返回其中一个序号。
+    请以[x, x, x]格式返回结果，其中x为符合的信息的序号；若没有符合的情况，请返回[]。
+文本内容：{full_text}
             """,
             os.path.join(output_dir, "temp2.txt"),
             "废标项"
@@ -605,9 +619,9 @@ def combine_find_invalid(invalid_docpath, output_dir):
   - `6. 联合体投标各方不得...` → 包含，返回序号 6。
 - **不符合条件**：
   - `14. 采购人不得...` → 主语为“采购人”，排除。
-  
+
 请根据上述筛选要求，阅读以下文本内容，并以 `[x,x,x]` 格式返回符合条件的条款序号，如果没有符合条件的条款，返回 `[]`。
-            
+
 文本内容：{full_text}
             """,
             os.path.join(output_dir, "temp3.txt"),
@@ -656,10 +670,10 @@ if __name__ == '__main__':
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
     pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
 
-    output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp"
+    output_dir = r"D:\flask_project\flask_app\static\output\output1\05298da2-a797-4f77-b7d3-8fb5b401d4c2\tmp"
     # invalid_added=insert_mark(pdf_path)
     # invalid_added_docx=pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\953e3722-f49e-4f2f-b513-513b75894701\invalid_added.docx'
+    invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\05298da2-a797-4f77-b7d3-8fb5b401d4c2\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
diff --git a/flask_app/test_case/test_正则表达式.py b/flask_app/test_case/test_正则表达式.py
index 7c6c711..29a4c05 100644
--- a/flask_app/test_case/test_正则表达式.py
+++ b/flask_app/test_case/test_正则表达式.py
@@ -2,14 +2,14 @@ import re
 
 # 定义清理函数
 def clean_data(data):
-    pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）\.、．]?\s*)'
+    pattern = r'^\s*(?:[（(]\s*\d+\s*[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）\.、．]?\s*)'
     return re.sub(pattern, '', data).strip()
 
 # 定义测试用例
 test_cases = [
     {
         "description": "阿拉伯数字加逗号",
-        "input": "2 、单位负责人为同一人或者存在直接控股 、管理关系的不同投标人， 不得 参加本项目同一合同项下的政府采购活动。",
+        "input": "（ 2 ）单位负责人为同一人或者存在直接控股 、管理关系的不同投标人， 不得 参加本项目同一合同项下的政府采购活动。",
         "expected": "单位负责人为同一人或者存在直接控股 、管理关系的不同投标人， 不得 参加本项目同一合同项下的政府采购活动。"
     },
     {
diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py
index 8cc01c9..c48a486 100644
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@@ -215,8 +215,9 @@ def generate_template(required_keys,full_text, type=1):
     
     注意事项：
     1. 提取的要求应为采购、招标活动或项目的整体要求，而非针对具体采购物品的技术参数或功能要求。
-    2. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
-    3. 请不要提取{another_keys_str}中的内容。
+    2. 采购要求是针对投标人、中标人、供应商等投标相关主体的具体要求，避免回答如行政性要求（投标文件的提交方式、截止时间、地点等）、招标活动流程、答疑相关内容等。
+    3. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
+    4. 请不要提取{another_keys_str}中的内容。
 
     要求与指南：
     1. JSON 的结构要求：
@@ -351,11 +352,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
 if __name__ == "__main__":
     # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
     # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件（广水市教育局封闭管理）_procurement.pdf"
-    procurement_path=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\ztbfile_procurement.pdf'
+    procurement_path=r'D:\flask_project\flask_app\static\output\output1\8bb07ee1-bcbb-4244-9d1e-367a783f1e40\invalid_del.pdf'
     docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
     # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
     # file_id = upload_file(truncate_file)
     # processed_filepath = pdf2txt(procurement_path)
-    processed_filepath=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\extract1.txt'
+    processed_filepath=r'D:\flask_project\flask_app\static\output\output1\8bb07ee1-bcbb-4244-9d1e-367a783f1e40\extract1.txt'
     final_res= get_business_requirements(procurement_path,processed_filepath,1)
     print(json.dumps(final_res, ensure_ascii=False, indent=4))