From 800326d827ab3834b2522ef8425e3849b41a8048 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Wed, 22 Jan 2025 14:18:28 +0800
Subject: [PATCH] =?UTF-8?q?1.22=20=E8=A7=A3=E5=86=B3=E4=BA=86=E6=97=A0?=
 =?UTF-8?q?=E6=95=88=E6=A0=87=E5=BA=9F=E6=A0=87txt=E5=92=8Call=5Ftext1?=
 =?UTF-8?q?=E7=94=B1=E4=BA=8E=E5=8E=BB=E9=87=8D=E8=80=8C=E4=B8=8D=E4=B8=80?=
 =?UTF-8?q?=E8=87=B4=E7=9A=84bug=20=E8=A7=A3=E5=86=B3=E4=BA=86process=5Fst?=
 =?UTF-8?q?ring=5Flist=E5=8F=AF=E8=83=BD=E8=BF=94=E5=9B=9E=E5=AD=97?=
 =?UTF-8?q?=E7=AC=A6=E4=B8=B2=E5=88=97=E8=A1=A8=E7=9A=84bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/无效标和废标公共代码.py | 132 ++++++++++++----------
 flask_app/general/通用功能函数.py         |  42 +++----
 2 files changed, 93 insertions(+), 81 deletions(-)

diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
index 81b59f8..c8c0cf1 100644
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -401,7 +401,8 @@ def split_cell_text(text):
         r'(?<![+\-×÷*/.\．A-Za-z]\s*|\d)(?=\d+[、.．](?!\d|(?:\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家])))|'  # 数字后直接跟顿号、点号时分割，且点号后不跟数字  eg:'1.'
         r'(?<![A-Za-z])(?=[A-Za-z][.．]\s*(?![A-Za-z]))|'  # 单个字母+点号或单个字母+数字，排除www.baidu.com 网址情况 
         r'(?=[A-Za-z]+\s*\d+\s*(?:[.．]\s*\d+)*)|'  # 在字母加数字或多级编号前分割
-        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)',  # 在中文数字加顿号（如一、二、）前分割，(?<=^|\n)增加了行首限制
+        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)|'  # 在中文数字加顿号（如一、二、）前分割，(?<=^|\n)增加了行首限制
+        r'(?<=^|\n)(?=[①②③④⑤⑥⑦⑧⑨]+)',
         item_with_placeholders
     )
 
@@ -431,49 +432,62 @@ def extract_file_elements(file_path):
     for element in doc_elements:
         # 如果是段落
         if element.tag.endswith('}p'):
-            if pre_table_head:
-                text = doc_paragraphs[paragraph_index].text
-                # 如果上一个是表格，并且之后没有文本或为跨页标记，则不提取
-                if (text == '' or pattern_marker.search(text)):
+            try:
+                # Ensure we only process paragraphs if paragraph_index is within range
+                if paragraph_index < len(doc_paragraphs):
+                    # print(f"Processing paragraph {paragraph_index}/{len(doc_paragraphs)}")
+                    text = doc_paragraphs[paragraph_index].text
+                    if pre_table_head:
+                        # If we are still processing a table, check conditions
+                        if (text == '' or pattern_marker.search(text)):
+                            paragraph_index += 1
+                            continue
+                        else:
+                            doc_contents.append('[$$table_over$$]')
+                            table_combine = False
+                            pre_table_head = None
+                    doc_contents.append(doc_paragraphs[paragraph_index])
                     paragraph_index += 1
-                    continue
-                # 如果遇到有效文本，则说明表格提取完毕
                 else:
-                    doc_contents.append('[$$table_over$$]')
-                    table_combine = False
-                    pre_table_head = None
-            doc_contents.append(doc_paragraphs[paragraph_index])
-            paragraph_index += 1
+                    raise IndexError(f"Paragraph index {paragraph_index} is out of range. Total paragraphs: {len(doc_paragraphs)}")
+            except IndexError as e:
+                print(f"Error processing paragraph: {e}")
+                continue  # Skip to the next element if an error occurs
+
         # 如果是表格
         elif element.tag.endswith('}tbl'):
-            table = doc_tables[tables_index]
-            table_content = []
-            for row_idx, row in enumerate(table.rows):
-                if row_idx == 0:
-                    # 跳过表头
-                    if pre_table_head:
-                        table_combine = True
-                        if pre_table_head == row.cells[0].text:
-                            continue
-                    # 记录初始表头
-                    else:
-                        pre_table_head = row.cells[0].text
-                        doc_contents.append('[$$table_start$$]')
-                        continue
-                # 遍历每一行中的单元格
-                for cell in row.cells:
-                    cell_text = cell.text.strip()  # 去除单元格内容前后空白
-                    if len(cell_text) > 8:  # 检查文字数量是否大于8
-                        cell_text = split_cell_text(cell_text)
-                        table_content += cell_text
-            # 合并跨页表格
-            if table_combine:
-                if not doc_contents[-1].endswith(('。', '!', '?', ';')):
-                    doc_contents[-1] += ' ' + table_content[0]
-                    table_content.pop(0)
-            doc_contents.extend(table_content)
-            # doc_contents.append('[$$table_over$$]')
-            tables_index += 1
+            try:
+                if tables_index < len(doc_tables):
+                    # print(f"Processing table {tables_index}/{len(doc_tables)}")
+                    table = doc_tables[tables_index]
+                    table_content = []
+                    for row_idx, row in enumerate(table.rows):
+                        if row_idx == 0:
+                            if pre_table_head:
+                                table_combine = True
+                                if pre_table_head == row.cells[0].text:
+                                    continue
+                            else:
+                                pre_table_head = row.cells[0].text
+                                doc_contents.append('[$$table_start$$]')
+                                continue
+                        for cell in row.cells:
+                            cell_text = cell.text.strip()
+                            if len(cell_text) > 8:
+                                cell_text = split_cell_text(cell_text)
+                                table_content += cell_text
+                    if table_combine:
+                        if doc_contents and table_content and not doc_contents[-1].endswith(('。', '!', '?', ';')):
+                            doc_contents[-1] += ' ' + table_content[0]
+                            table_content.pop(0)
+                    doc_contents.extend(table_content)
+                    tables_index += 1
+                else:
+                    raise IndexError(f"Table index {tables_index} is out of range. Total tables: {len(doc_tables)}")
+            except IndexError as e:
+                print(f"Error processing table: {e}")
+                continue  # Skip to the next element if an error occurs
+
     return doc_contents
 
 def handle_query(file_path, user_query, output_file, result_key, keywords):
@@ -493,26 +507,27 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
         extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
         all_texts1,all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # 列表
         # print(all_texts2)
-
+        seen_contents = set()  # 使用集合来跟踪已出现的内容
+        unique_all_texts1 = {}  # 存储去重后的内容
+        for key, content in all_texts1.items():
+            content_key = content[:25]  # 提取前25个字符作为去重依据
+            if content_key not in seen_contents:
+                unique_all_texts1[key] = content  # 保留第一次出现的内容
+                seen_contents.add(content_key)
         # 1. 得到有序的 all_text1_items
-        all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])
+        all_text1_items = sorted(unique_all_texts1.items(), key=lambda x: x[0])
         # 2. 得到纯内容列表
         all_texts1_list = [content for (_, content) in all_text1_items]
         # Proceed only if there is content to write
         selected_contents = {}
         final_list=[f"未解析到'{result_key}'！"]
-        seen_contents = set()  # 使用集合跟踪已添加的内容以去重
         if all_texts1_list or all_texts2:
             with open(output_file, 'w', encoding='utf-8') as file:
                 counter = 1
                 for content in all_texts1_list:
-                    # 使用内容的前25个字符作为去重的依据
-                    key = content[:25]  # 提取前25个字符
-                    if key not in seen_contents:  # 如果前30个字符未出现过
-                        file.write(f"{counter}. {content}\n")
-                        file.write("..............." + '\n')
-                        seen_contents.add(key)  # 标记前30个字符为已写入
-                        counter += 1
+                    file.write(f"{counter}. {content}\n")
+                    file.write("..............." + '\n')
+                    counter += 1
 
             # 生成用户查询
             if not all_texts1_list:
@@ -525,7 +540,8 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
                 # model_ans = qianwen_long(file_id, user_query)
                 num_list = process_string_list(model_ans)  # 处理模型返回的序号
                 print(result_key + "选中的序号:" + str(num_list))
-
+            # print(all_texts1_list)
+            # print(all_text1_items)
             for index in num_list:
                 if 1 <= index <= len(all_texts1_list):
                     original_global_idx = all_text1_items[index - 1][0]
@@ -566,9 +582,11 @@ def combine_find_invalid(invalid_docpath, output_dir):
 任务目标：
 从文本中筛选所有描述否决投标，拒绝投标，投标、响应无效或类似表述的情况，并返回对应的序号。
 要求与指南：
-    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
+    1.投标相关主体：包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
+    2.文本中可能存在无关的信息，请准确筛选符合条件的信息，即怎样的情况下，投标相关主体的投标将被否决、拒绝，作为无效标，或者是投标无效、响应无效等，请返回符合条件的信息的序号。
+    3.若条款内容包含'否决投标的情况说明'这样的笼统描述，而未说明具体的情形，则无需返回该条款。
 输出格式：
-    以 [x, x, x] 的形式返回，x 为符合条件的信息的序号，为自然数。
+    以 [x, x, x] 的形式返回，x 为符合条件的信息的序号，为自然数。无需额外返回解释与说明。
     如果文本中没有符合条件的信息，请返回 []。
 特殊情况：
     如果某序号的内容明显分为几部分且一部分内容符合筛选条件，但其他部分明显是无关内容，请返回符合部分的字符串内容代替序号。
@@ -587,7 +605,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
 要求与指南：
     文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
 输出格式：
-    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的信息的序号，为自然数。
+    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的信息的序号，为自然数。无需额外返回解释与说明。
     如果文本中没有任何符合条件的废标情况，请返回 []。
 示例输出,仅供格式参考：
     [1,3,4,6]
@@ -611,7 +629,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
     若在语境中其指代或包含投标相关主体，则应将其考虑在内；否则，排除该条款。
 
 **输出格式**：
-    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的条款的序号，为自然数。
+    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的条款的序号，为自然数。无需额外返回解释与说明。
     如果没有符合条件的条款，返回 `[]`。
 **示例**：
 - **符合条件**：
@@ -661,10 +679,10 @@ if __name__ == '__main__':
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
     pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf'
 
-    output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
+    output_dir = r"D:\flask_project\flask_app\static\output\output1\63504625-9178-47df-b513-17709434fa68\tmp"
     # invalid_added = insert_mark(pdf_path)
     # # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
+    invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\63504625-9178-47df-b513-17709434fa68\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py
index f141879..f7888bd 100644
--- a/flask_app/general/通用功能函数.py
+++ b/flask_app/general/通用功能函数.py
@@ -217,34 +217,28 @@ def judge_consortium_bidding(baseinfo_list):
     baseinfo_list[:] = updated_list
     return accept_bidding
 
-#字符串列表转为普通列表，从qianwen回答中提取
+#字符串列表转为普通列表，从大模型回答中提取
 def process_string_list(string_list):
     # 使用正则表达式匹配方括号内的内容
-    match = re.search(r'\[(.*?)\]', string_list)
-    if match:
-        # 获取匹配的内容，即方括号内的部分
-        content_inside_brackets = match.group(1)
-        if content_inside_brackets:  # 检查内容是否为空
-            # 检查内容是否是数字列表
-            if all(item.strip().isdigit() for item in content_inside_brackets.split(',')):
-                # 如果是数字，不用加引号，直接保留数字
-                formatted_list = '[' + ', '.join(item.strip() for item in content_inside_brackets.split(',') if item.strip()) + ']'
+    try:
+        match = re.search(r'\[(.*?)\]', string_list)
+        if match:
+            # 获取匹配的内容，即方括号内的部分
+            content_inside_brackets = match.group(1)
+            if content_inside_brackets:  # 检查内容是否为空
+                # 提取所有数字项，并转换为整数
+                numbers = [
+                    int(item.strip()) for item in content_inside_brackets.split(',')
+                    if re.match(r'^\d+$', item.strip())  # 正则表达式判断是否为纯数字
+                ]
+                return numbers
             else:
-                # 如果不全是数字，按字符串处理
-                formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']'
+                return []  # 如果内容为空，直接返回空列表
         else:
-            return []  # 直接返回空列表如果内容为空
-
-        # 使用 ast.literal_eval 来解析格式化后的字符串
-        try:
-            actual_list = ast.literal_eval(formatted_list)
-            return actual_list
-        except SyntaxError as e:
-            print(f"禁止投标情形: Error parsing list: {e}")
-            return []
-    else:
-        # 如果没有匹配到内容，返回空列表
-        return []
+            return []  # 如果没有匹配到内容，返回空列表
+    except Exception as e:
+        print(f"Error occurred: {e}")
+        return []  # 出现任何异常时返回空列表
 
 def get_global_logger(unique_id):
     if unique_id is None: