From 8bfbfc4263ea7252c88094a1d07013eb5b5e2641 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 30 Dec 2024 10:11:27 +0800
Subject: [PATCH] =?UTF-8?q?12.30=20=E5=BA=9F=E6=A0=87=E9=A1=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../投标人须知正文条款提取成json文件.py   |   8 +-
 flask_app/general/无效标和废标公共代码.py | 204 ++++++++++++++++--
 2 files changed, 193 insertions(+), 19 deletions(-)

diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py
index 75a61a0..0b9d6b6 100644
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -113,10 +113,10 @@ def parse_text_by_heading(text):
             current_content.append(line_stripped)
             continue
 
-        # 匹配带数字的标题,例如 '12.1 内容'
-        match = re.match(r'^(?

diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ ... @@ def preprocess_paragraphs(paragraphs):
         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
@@ -101,7 +112,10 @@ def preprocess_paragraphs(paragraphs):
     # 辅助函数:查找上一个非空且非标记的段落
     def find_prev_text(current_index):
         for i in range(current_index - 1, -1, -1):
-            text = paragraphs[i].text.strip()
+            try:
+                text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # 如果段落对象没有 text 属性,跳过
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1
@@ -109,13 +123,21 @@ def preprocess_paragraphs(paragraphs):
     # 辅助函数:查找下一个非空且非标记的段落
     def find_next_text(current_index):
         for i in range(current_index + 1, len(paragraphs)):
-            text = paragraphs[i].text.strip()
+            try:
+                text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # 如果段落对象没有 text 属性,跳过
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1
 
     while index < len(paragraphs):
-        current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+        try:
+            current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+        except AttributeError:
+            # 如果段落对象没有 text 属性,跳过该段落
+            index += 1
+            continue
 
         # 检查当前段落是否为页面标记
         if pattern_marker.search(current_text):
@@ -147,7 +169,11 @@ def preprocess_paragraphs(paragraphs):
             # 从当前 index 向下,跳过所有连续的空白段落和标记
             skip_index = index + 1
             while skip_index < len(paragraphs):
-                skip_text = paragraphs[skip_index].text.strip()
+                try:
+                    skip_text = paragraphs[skip_index].text.strip()
+                except AttributeError:
+                    skip_index += 1
+                    continue  # 如果段落对象没有 text 属性,跳过
                 if skip_text == '' or pattern_marker.search(skip_text):
                     skip_index += 1
                 else:
@@ -161,12 +187,155 @@ def preprocess_paragraphs(paragraphs):
             index += 1
             continue
 
-        # 如果当前段落不是标记且不匹配排除模式,则添加到处理后的列表中
-        processed.append(current_text)
-        index += 1
+        # 检查是否为以数字序号开头的段落
+        match = pattern_numeric_header.match(current_text)
+        if not match:
+            match = pattern_numeric_header_fallback.match(current_text)
+
+        if match:
+            # 当前段落以数字序号开头,直接添加到 processed
+            processed.append(current_text)
+            flag = True  # 设置标志位,准备处理下一个段落
+            index += 1
+            continue
+        else:
+            if flag:
+                # 当前段落不以数字序号开头,且 flag 为 True
+                if not list_item_pattern.match(current_text):
+                    if processed:
+                        # **新增逻辑开始**
+                        next_non_empty_text, next_non_empty_index = find_next_text(index)
+                        is_next_numbered = False
+                        if next_non_empty_text:
+                            is_next_numbered = bool(
+                                pattern_numeric_header.match(next_non_empty_text) or
+                                pattern_numeric_header_fallback.match(next_non_empty_text)
+                            )
+
+                        if is_next_numbered:
+                            # 只有在下一个段落以数字序号开头时,才将当前段落追加到上一个段落
+                            processed[-1] = processed[-1] + ' ' + current_text
+                        else:
+                            # 否则,不追加,而是作为新的段落添加
+                            processed.append(current_text)
+                        # **新增逻辑结束**
+                else:
+                    # **新增处理:匹配 list_item_pattern 的段落也应被保存**
+                    processed.append(current_text)
+                # 无论是否追加,都将 flag 重置
+                flag = False
+                index += 1
+                continue
+            else:
+                # flag 为 False,直接添加到 processed
+                processed.append(current_text)
+                index += 1
+                continue
 
     return processed
 
+#老版本
+# def preprocess_paragraphs(paragraphs):
+#     processed = []  # 初始化处理后的段落列表
+#     index = 0
+#     # 定义两个新的正则表达式模式
+#     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+#     pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+#     # 定义列表项的模式
+#     list_item_pattern = re.compile(
+#         r'^\s*('
+#         r'[(\(]\d+[)\)]|'  # 匹配:(1) 或 (1)
+#         r'[A-Za-z]\.\s*|'  # 匹配:A. 或 b.
+#         r'[一二三四五六七八九十]+、|'  # 匹配:一、二、三、
+#         r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 匹配:第x章,第x部分,第x节
+#         r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|'  # 匹配:A1.2 等
+#         r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # 匹配:数字序号如1.1 1.1.1
+#         r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # 数字后空格,空格后非指定关键字
+#         r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # 数字后直接跟顿号或点号
+#         r')'
+#     )
+#
+#     # 是否存在连续超过指定长度的空白字符序列: 排除遇到表格、填空的情况
+#     def has_long_spaces(text, max_space_count=5):
+#         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
+#
+#     # 正则表达式用于检测页面标记
+#     pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
+#
+#     # 辅助函数:查找上一个非空且非标记的段落
+#     def find_prev_text(current_index):
+#         for i in range(current_index - 1, -1, -1):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     # 辅助函数:查找下一个非空且非标记的段落
+#     def find_next_text(current_index):
+#         for i in range(current_index + 1, len(paragraphs)):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     while index < len(paragraphs):
+#         try:
+#             current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+#         except AttributeError:
+#             # 如果段落对象没有 text 属性,跳过该段落
+#             index += 1
+#             continue
+#
+#         # 检查当前段落是否为页面标记
+#         if pattern_marker.search(current_text):
+#             # 动态查找前一个非空段落
+#             prev_text, prev_index = find_prev_text(index)
+#             # 动态查找后一个非空段落
+#             next_text, next_index = find_next_text(index)
+#
+#             # 应用现有的合并逻辑
+#             if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
+#                 if not prev_text.endswith(('。', '!', '?')):  # ',', ',', 先注释了,如果逗号,可能还没结束。
+#                     # 检查后一个段落是否为列表项
+#                     if not list_item_pattern.match(next_text) and len(prev_text) > 30:
+#                         # 合并前后段落
+#                         merged_text = prev_text + ' ' + next_text  # 为了可读性添加空格
+#                         if prev_index < len(paragraphs):
+#                             # 移除 processed 中的前一个段落
+#                             if processed and processed[-1] == prev_text:
+#                                 processed.pop()
+#                             # 添加合并后的文本
+#                             processed.append(merged_text)
+#
+#                             # 跳过标记以及前后所有空白段落,直到 next_index
+#                             index = next_index + 1
+#                             continue  # 继续下一个循环
+#
+#             # 如果不满足合并条件,跳过标记及其周围的空白段落
+#             # 计算下一个需要处理的索引
+#             # 从当前 index 向下,跳过所有连续的空白段落和标记
+#             skip_index = index + 1
+#             while skip_index < len(paragraphs):
+#                 skip_text = paragraphs[skip_index].text.strip()
+#                 if skip_text == '' or pattern_marker.search(skip_text):
+#                     skip_index += 1
+#                 else:
+#                     break
+#             index = skip_index
+#             continue  # 继续下一个循环
+#
+#         # 检查当前段落是否匹配任一排除模式
+#         if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
+#             # 如果匹配,则跳过当前段落,不添加到processed列表中
+#             index += 1
+#             continue
+#
+#         # 如果当前段落不是标记且不匹配排除模式,则添加到处理后的列表中
+#         processed.append(current_text)
+#         index += 1
+#
+#     return processed
+
 #如果当前段落有序号,则向下匹配直接遇到相同的序号样式
 #如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
 def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
@@ -224,7 +393,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
         # for i in range(1, 6): #同级,与matched_patterns = [pattern]重复了,故注释
         #     # 复制 parts 列表以避免修改原列表
-        #     new_parts = parts.copy()
+        #     new_parts = parts. copy()
         #     new_parts[-1] = str(int(new_parts[-1]) + i)
         #     # 使用不同的分隔符
         #     next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts)
@@ -262,6 +431,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
         while current_index < len(processed_paragraphs) - 1:
             current_index += 1
             next_text = processed_paragraphs[current_index].strip()
+            # 添加对空白行的处理
+            if not next_text:
+                continue  # 跳过空白行,进入下一个循环
             if not found_next_number:
                 # 修改后的正则,支持 '数字 、' 格式
                 next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
@@ -289,7 +461,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
     processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
     index = 0
     while index < len(processed_paragraphs):
+        # print(processed_paragraphs[index].strip())
         index = extract_from_text(processed_paragraphs[index].strip(), index)
+        # print("--------------")
         index += 1
     return extracted_paragraphs
 
@@ -664,7 +838,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
     print("无效标与废标done...")
     return {"无效标与废标项": combined_dict}
 
-#TODO:无效标这里提示词修改一下
+#TODO:preprocess测试一下有无问题
 if __name__ == '__main__':
     start_time = time.time()
     # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@@ -672,12 +846,12 @@ if __name__ == '__main__':
     # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
     # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
-    pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
+    pdf_path = r'C:\Users\Administrator\Desktop\货物标\truncate_all\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).pdf'
 
-    output_dir = r"D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\tmp"
-    # invalid_added=insert_mark(pdf_path)
-    # invalid_added_docx=pdf2docx(invalid_added)
-    invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\invalid_added.docx'
+    output_dir = r"C:\Users\Administrator\Desktop\货物标\truncate_all"
+    # invalid_added = insert_mark(pdf_path)
+    # invalid_added_docx = pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\truncate_all\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
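
A minimal sketch, not part of the diff above, of how the patched preprocess_paragraphs could be exercised in isolation. It assumes the module flask_app/general/无效标和废标公共代码.py is importable from the project root and that pattern_numeric_header matches plain "1.1"-style headings (its exact definition is added elsewhere in this patch and is not shown here); FakeParagraph is a hypothetical stand-in for python-docx Paragraph objects, of which only the .text attribute is used.

# Sketch only: exercises the new flag/merge logic and the AttributeError guard.
# Assumptions: the import path below resolves, and pattern_numeric_header matches
# "1.1"-style headings. FakeParagraph is hypothetical; real callers pass doc.paragraphs.
from flask_app.general.无效标和废标公共代码 import preprocess_paragraphs


class FakeParagraph:
    """Stand-in for a python-docx Paragraph: only .text is accessed."""
    def __init__(self, text):
        self.text = text


paragraphs = [
    FakeParagraph("1.1 投标人资格要求"),        # numbered heading: kept, sets the flag
    FakeParagraph("续行文字,应并入上一条。"),    # unnumbered follow-up line
    FakeParagraph("1.2 投标文件的组成"),         # next numbered heading
    FakeParagraph("$$index_mark_1$$"),           # page marker handled by the merge/skip logic
    object(),                                    # no .text attribute: skipped by the new AttributeError guard
]

for line in preprocess_paragraphs(paragraphs):
    print(line)
# Expected, if pattern_numeric_header behaves as assumed: the follow-up line is appended
# to "1.1 投标人资格要求" because the next non-empty paragraph is itself numbered.

This is intended as a quick check for the "#TODO:preprocess测试一下有无问题" item; the real entry point in the patch still goes through combine_find_invalid with a converted .docx file.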