From 2226d27a3cf9183d1a07001c26822c7a953ddfe9 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Fri, 11 Oct 2024 11:08:38 +0800
Subject: [PATCH] =?UTF-8?q?=E6=97=A0=E6=95=88=E6=A0=87=E5=BA=9F=E6=A0=87?=
 =?UTF-8?q?=E6=8F=90=E5=8F=96=E4=BC=98=E5=8C=96=E7=89=88=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/main/table_content_extraction.py    |   4 +-
 flask_app/main/无效标和废标和禁止投标整合.py  |  45 ++++----
 flask_app/main/读取文件/读取docx.py           |  38 ++++++-
 .../无效标和废标和禁止投标整合货物标版.py     | 101 ++++++++++++------
 4 files changed, 130 insertions(+), 58 deletions(-)

diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py
index ef96f32..c719523 100644
--- a/flask_app/main/table_content_extraction.py
+++ b/flask_app/main/table_content_extraction.py
@@ -130,8 +130,8 @@ def process_all_part1_pdfs(folder_path, output_folder):
                 extract_tables_main(file_path, subfolder_path)
 
 if __name__ == "__main__":
-    path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件111_tobidders_notice_part1.docx'
-    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp"  # 前附表json文件
+    path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
+    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp"  # 前附表json文件
     res=extract_tables_main(path, output_folder)
     #
     # folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4'
diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py
index 36d0418..1db1254 100644
--- a/flask_app/main/无效标和废标和禁止投标整合.py
+++ b/flask_app/main/无效标和废标和禁止投标整合.py
@@ -133,7 +133,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes):    #让正则表达
                 if any(exclude in data for exclude in excludes):
                     continue  # 如果包含任何排除字符串，跳过这个数据
                 # 去掉开头的序号，包括字母+数字的格式 以及括号+数字
-                pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
+                pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                 data = re.sub(pattern, '', data).strip()
                 keyword_match = re.search(keywords, data)
                 if keyword_match:
@@ -158,7 +158,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes):    #让正则表达
             # print(text_list)
             new_text_list=preprocess_text_list(text_list)
             # print(new_text_list)
-            pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
+            pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
             data = re.sub(pattern, '', new_text_list[0]).strip()     #去除序号
             # 将修改后的第一个元素和剩余的元素连接起来
             new_text_list[0] = data  # 更新列表中的第一个元素
@@ -183,7 +183,8 @@ def find_sentences_with_keywords(data, keywords, follow_up_keywords):
             sentences2.extend(result2)
     elif isinstance(data, str):
         # 分割句子，保证句子完整性（按标点符号和序号分割）
-        split_sentences = re.split(r'(?<=[。！？\!\?])|(?=\d+[\、\.])|(?=[（(]\d+[)）])', data)  # 扩展匹配序号分割
+        # split_sentences = re.split(r'(?<=[。！？\!\?])|(?=\d+[\、\.])|(?=[（(]\d+[)）])', data)  # 扩展匹配序号分割
+        split_sentences = re.split(r'(?<=[。！？\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[（(]\d+[)）])', data)
         i = 0
         while i < len(split_sentences):
             sentence = split_sentences[i].strip()
@@ -205,12 +206,14 @@ def find_sentences_with_keywords(data, keywords, follow_up_keywords):
                         full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                     else:
                         full_text = ' '.join(split_sentences[start_index:]).strip()
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    # pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                     data=re.sub(pattern,'',full_text)
                     sentences2.append(data)  # 存储有后续关键词的情况
                     i = end_index if found_next_section else len(split_sentences)
                 else:
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    # pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                     data = re.sub(pattern, '', sentence).replace('\n','').strip()
                     sentences1.append(data)  # 存储没有后续关键词的情况
                     i += 1
@@ -274,36 +277,36 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
     all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # 列表
     all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)   #提取表格数据（json_data）
     qianwen_txt = all_texts1 + all_tables1
-    # Proceed only if there is content to write
+    selected_contents = set()  # 使用 set 去重
+
     if qianwen_txt:
         with open(output_file, 'w', encoding='utf-8') as file:
-            # 初始化一个计数器
             counter = 1
             for content in qianwen_txt:
-                file.write("..............."+'\n')
-                # 写入内容前加上序号，后面接一个点和空格，然后是内容
+                file.write("..............." + '\n')
                 file.write(f"{counter}. {content}\n")
-                # 更新计数器，每次循环递增
                 counter += 1
+
         file_id = upload_file(output_file)
         qianwen_ans = qianwen_long(file_id, user_query)
-        selected_contents = []
         num_list = process_string_list(qianwen_ans)
         print(num_list)
 
         for index in num_list:
             if index - 1 < len(qianwen_txt):
-                content = qianwen_txt[index - 1]  # 转换序号为索引（假设序号从1开始）
-                selected_contents.append(content)
-        selected_contents += all_texts2
-        selected_contents += all_tables2
-        # 创建一个字典来保存结果
-        res = {result_key: selected_contents}
-        # 将结果转换为JSON字符串
-        # os.remove(output_file)  # Remove the file after use
-        # print(f"Deleted temporary file: {output_file}")
+                content = qianwen_txt[index - 1]
+                selected_contents.add(content)
+
+    # 无论 qianwen_txt 是否为空，都添加 all_texts2 和 all_tables2 的内容
+    selected_contents.update(all_texts2)
+    selected_contents.update(all_tables2)
+
+    # 如果 selected_contents 不为空，则返回结果，否则返回空字符串
+    if selected_contents:
+        res = {result_key: list(selected_contents)}
     else:
-        res = {result_key: ""}  # Set the response to empty if no contents were extracted
+        res = {result_key: ""}
+
     return res
 
 def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
diff --git a/flask_app/main/读取文件/读取docx.py b/flask_app/main/读取文件/读取docx.py
index 5435f89..87b3edf 100644
--- a/flask_app/main/读取文件/读取docx.py
+++ b/flask_app/main/读取文件/读取docx.py
@@ -36,7 +36,41 @@ def read_docx_tables(file_path):
             print(f"Row {row_idx + 1}: {row_data}")
         print("\n" + "-" * 40 + "\n")  # 打印分隔线
 
+def read_tables_from_docx(file_path):
+    """Return the stripped text of every table cell longer than six characters.
+
+    Args:
+        file_path: Path of the .docx document, passed to ``Document``.
+
+    Returns:
+        Cell texts in table/row/cell order; ``[]`` when the document cannot
+        be opened or has no tables (a message is printed in both cases).
+    """
+    try:
+        doc = Document(file_path)
+    except Exception as e:  # best-effort: report the error, return an empty list
+        print(f"Error opening file: {e}")
+        return []
+
+    if not doc.tables:
+        print("No tables found in the document.")
+        return []
+
+    cell_contents = []
+    for table in doc.tables:
+        for row in table.rows:
+            for cell in row.cells:
+                cell_text = cell.text.strip()
+                if len(cell_text) > 6:  # keep cells of 7+ characters only
+                    cell_contents.append(cell_text)
+    return cell_contents
+
 if __name__ == "__main__":
-    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件(1)_tobidders_notice_part1.docx"
+    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
+    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp"  # pre-attached table (前附表) JSON file
     # read_docx(file_path)
-    read_docx_tables(file_path)
\ No newline at end of file
+    read_docx_tables(file_path)
+    cell_texts = read_tables_from_docx(file_path)  # one entry per retained table cell
+    for cell_text in cell_texts:
+        print(cell_text)
+        print("--------------")
\ No newline at end of file
diff --git a/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py
index 72402ae..1c8756f 100644
--- a/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py
+++ b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py
@@ -3,7 +3,6 @@ import json
 import os.path
 import time
 import re
-from flask_app.main.json_utils import combine_json_results, nest_json_under_key
 from flask_app.main.通义千问long import upload_file, qianwen_long
 from concurrent.futures import ThreadPoolExecutor
 from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
@@ -147,8 +146,8 @@ def clean_dict_datas(extracted_contents, keywords,excludes):    #让正则表达
                 # 检查是否包含任何需要排除的字符串
                 if any(exclude in data for exclude in excludes):
                     continue  # 如果包含任何排除字符串，跳过这个数据
-                # 去掉开头的序号，包括字母+数字的格式 以及括号+数字
-                pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
+                # 去掉开头的序号，eg:1 | (1) |（2） | 1. | 2．（全角点）| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
+                pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                 data = re.sub(pattern, '', data).strip()
                 keyword_match = re.search(keywords, data)
                 if keyword_match:
@@ -172,7 +171,8 @@ def clean_dict_datas(extracted_contents, keywords,excludes):    #让正则表达
         else:
             new_text_list=preprocess_text_list(text_list)
             #用于处理结构化文本，清理掉不必要的序号，并将分割后的段落合并，最终形成更简洁和格式化的输出。
-            pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
+            pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
+
             data = re.sub(pattern, '', new_text_list[0]).strip()     #去除序号
             # 将修改后的第一个元素和剩余的元素连接起来
             new_text_list[0] = data  # 更新列表中的第一个元素
@@ -181,10 +181,11 @@ def clean_dict_datas(extracted_contents, keywords,excludes):    #让正则表达
 
     return all_texts1,all_texts2        #all_texts1要额外用gpt   all_text2直接返回结果
 
-def read_docx_last_column(file_path):
+#只读取前附表中的最后一列（省钱，但容易漏内容）
+def read_docx_last_column(truncate_file):
     # 尝试打开文档
     try:
-        doc = Document(file_path)
+        doc = Document(truncate_file)
     except Exception as e:
         print(f"Error opening file: {e}")
         return []
@@ -207,15 +208,51 @@ def read_docx_last_column(file_path):
 
     return last_column_values
 
-#TODO:采购拦标价：人民币 380000.00 元供应商首次报价或最后报价超出本项目公布拦标价的，按照无效报价处
+# Read every table in the document in full (suits cheap PDF-to-docx conversion; preferred, content is complete).
+def read_tables_from_docx(file_path):
+    """Return the stripped text of every table cell longer than six characters.
+
+    Args:
+        file_path: Path of the .docx document, passed to ``Document``.
+
+    Returns:
+        Cell texts in table/row/cell order; ``[]`` when the document cannot
+        be opened or has no tables (a message is printed in both cases).
+    """
+    try:
+        doc = Document(file_path)
+    except Exception as e:  # best-effort: report the error, return an empty list
+        print(f"Error opening file: {e}")
+        return []
+
+    if not doc.tables:
+        print("No tables found in the document.")
+        return []
+
+    cell_contents = []
+    for table in doc.tables:
+        for row in table.rows:
+            for cell in row.cells:
+                cell_text = cell.text.strip()
+                if len(cell_text) > 6:  # keep cells of 7+ characters only
+                    cell_contents.append(cell_text)
+    return cell_contents
+
 def extract_table_with_keywords(data, keywords, follow_up_keywords):
     """遍历列表中的每个元素，查找并返回包含关键词的句子列表，并根据是否存在后续关键词分别存储到两个列表中。"""
     sentences1 = []  # 保存没有后续关键词的情况
     sentences2 = []  # 保存有后续关键词的情况
 
+    # 检查是否包含 '无效报价' 的关键词
+    check_invalid_bidding = '无\s*效\s*报\s*价' in keywords
     # 遍历列表中的每个字符串元素
     for item in data:
-        # 分割句子，保证句子完整性（按标点符号和序号分割）
+        # Only when the keywords pattern includes '无效报价' is the item checked for "无效报价"; a match keeps the whole item.
+        if check_invalid_bidding and re.search(r'无\s*效\s*报\s*价', item, re.IGNORECASE):
+            sentences1.append(item.strip())
+            continue
+
+        # 分割句子，保证句子完整性（按标点符号和序号分割）eg:(?=\d+\.\d+)：匹配诸如 1.1、2.2 之类的序号，并在序号前进行分割。
         split_sentences = re.split(r'(?<=[。！？\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[（(]\d+[)）])', item)
 
         i = 0
@@ -239,19 +276,17 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords):
                         full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                     else:
                         full_text = ' '.join(split_sentences[start_index:]).strip()
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    # pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                     full_text = re.sub(pattern, '', full_text)
                     sentences2.append(full_text)  # 存储有后续关键词的情况
                     i = end_index if found_next_section else len(split_sentences)
                 else:
                     # 没有后续关键词的情况
-                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
-                    #TODO:会删除什么范围的万
+                    # pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
+                    pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
                     cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').strip()
-                    # 删除句子中的 "万" 或 "元"
-                    cleaned_sentence = re.sub(r'[万元]', '', cleaned_sentence).strip()
                     sentences1.append(cleaned_sentence)  # 存储没有后续关键词的情况
-
                     i += 1
             else:
                 i += 1
@@ -303,47 +338,47 @@ def handle_query(file_path, user_query, output_file, result_key, keywords,trunca
     excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注：", "本人保证：","我方"]
     follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
     extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)   #字典结果
-    # print(extracted_contents)
     all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # 列表
     table_data_list=read_docx_last_column(truncate_file)   #从投标人须知前附表中提取信息生成列表data，每个元素为'一行信息'
+    # table_data_list=read_tables_from_docx(file_path)
     all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords,follow_up_keywords)
     qianwen_txt = all_texts1 + all_tables1
     # Proceed only if there is content to write
+    selected_contents = set()  # 使用 set 去重
+
     if qianwen_txt:
         with open(output_file, 'w', encoding='utf-8') as file:
-            # 初始化一个计数器
             counter = 1
             for content in qianwen_txt:
-                file.write("..............."+'\n')
-                # 写入内容前加上序号，后面接一个点和空格，然后是内容
+                file.write("..............." + '\n')
                 file.write(f"{counter}. {content}\n")
-                # 更新计数器，每次循环递增
                 counter += 1
+
         file_id = upload_file(output_file)
         qianwen_ans = qianwen_long(file_id, user_query)
-        selected_contents = set()  # 使用 set 去重
         num_list = process_string_list(qianwen_ans)
         print(num_list)
 
         for index in num_list:
             if index - 1 < len(qianwen_txt):
-                content = qianwen_txt[index - 1]  # 转换序号为索引（假设序号从1开始）
+                content = qianwen_txt[index - 1]
                 selected_contents.add(content)
-                # 将 all_texts2 和 all_tables2 中的内容也添加到 set 中
-        selected_contents.update(all_texts2)
-        selected_contents.update(all_tables2)
-        # 将 set 转换为 list 来返回结果
+
+    # 无论 qianwen_txt 是否为空，都添加 all_texts2 和 all_tables2 的内容
+    selected_contents.update(all_texts2)
+    selected_contents.update(all_tables2)
+
+    # 如果 selected_contents 不为空，则返回结果，否则返回空字符串
+    if selected_contents:
         res = {result_key: list(selected_contents)}
-        # 将结果转换为JSON字符串
-        # os.remove(output_file)  # Remove the file after use
-        # print(f"Deleted temporary file: {output_file}")
     else:
-        res = {result_key: ""}  # Set the response to empty if no contents were extracted
+        res = {result_key: ""}
+
     return res
 
 def combine_find_invalid(file_path, output_dir,truncate_file):
     queries = [
-        (r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
+        (r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
          "以上是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：否决投标或拒绝投标或无效投标或投标失效的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。",
          os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
         (r'废\s*标',
@@ -381,11 +416,11 @@ def combine_find_invalid(file_path, output_dir,truncate_file):
 if __name__ == '__main__':
     start_time = time.time()
     # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
-    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件(1)_tobidders_notice_part1.docx"
-    clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause磋商文件(1)_tobidders_notice_part2.json"
+    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件（实高电子显示屏）_tobidders_notice_part1.docx"
+    clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件（实高电子显示屏）_tobidders_notice_part2.json"
     output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid"
     # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
-    doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx'
+    doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件（实高电子显示屏）.docx'
     results = combine_find_invalid(doc_path, output_dir,truncate_file)
     end_time = time.time()
     print("Elapsed time:", str(end_time - start_time))