12.30 Invalid bid items

zy123 2024-12-30 10:11:27 +08:00
parent 70b6a4b999
commit 8bfbfc4263
2 changed files with 193 additions and 19 deletions


@ -74,9 +74,12 @@ def read_tables_from_docx(file_path):
def preprocess_paragraphs(paragraphs):
    processed = []  # initialize the list of processed paragraphs
    index = 0
    flag = False  # initialize the flag
    # define two new regular expression patterns
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
    # pattern for list items
    list_item_pattern = re.compile(
        r'^\s*('
@ -91,6 +94,14 @@ def preprocess_paragraphs(paragraphs):
        r')'
    )
    # new regexes for paragraphs that start with a numeric section number
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)'  # matches e.g. '12.1 内容'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # matches e.g. '12. 内容'
    )
    # whether the text contains a run of whitespace longer than the given length: excludes tables and fill-in blanks
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
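
A quick spot-check of what the two new numeric-header patterns accept (a sketch; the sample strings are made up, not from the repository):

# Hypothetical spot-checks for the numeric-header patterns and has_long_spaces.
import re

pattern_numeric_header = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
pattern_numeric_header_fallback = re.compile(r'^(\d+\.)\s*(.+)$')

assert pattern_numeric_header.match('12.1 投标人须知')        # dotted multi-level number: matched
assert not pattern_numeric_header.match('12. 内容')           # the primary pattern needs at least one '.digits' group
assert pattern_numeric_header_fallback.match('12. 内容')      # single-level '12.' is caught by the fallback
assert not pattern_numeric_header.match('A12.1 条款')         # letter-prefixed numbers are not section headers here

def has_long_spaces(text, max_space_count=5):
    return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

assert has_long_spaces('名称          金额')   # a long run of spaces, e.g. a flattened table row
assert not has_long_spaces('普通 正文 段落')
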
@ -101,7 +112,10 @@ def preprocess_paragraphs(paragraphs):
    # helper: find the previous non-empty, non-marker paragraph
    def find_prev_text(current_index):
        for i in range(current_index - 1, -1, -1):
            try:
                text = paragraphs[i].text.strip()
            except AttributeError:
                continue  # skip paragraph objects without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1
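
The new try/except guards matter when the paragraph list contains items without a .text attribute. A minimal, self-contained illustration (the inputs are hypothetical, not from the repo):

import re
from types import SimpleNamespace

pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
paragraphs = [
    SimpleNamespace(text='1.1 资格要求'),
    object(),                                  # no .text attribute: would raise AttributeError
    SimpleNamespace(text=''),                  # blank paragraph
    SimpleNamespace(text='$$index_mark_3$$'),  # page marker
]

def find_prev_text(current_index):
    for i in range(current_index - 1, -1, -1):
        try:
            text = paragraphs[i].text.strip()
        except AttributeError:
            continue  # tolerate non-paragraph items
        if text and not pattern_marker.search(text):
            return text, i
    return '', -1

print(find_prev_text(3))  # ('1.1 资格要求', 0): the blank line and the bad item are skipped
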
@ -109,13 +123,21 @@ def preprocess_paragraphs(paragraphs):
    # helper: find the next non-empty, non-marker paragraph
    def find_next_text(current_index):
        for i in range(current_index + 1, len(paragraphs)):
            try:
                text = paragraphs[i].text.strip()
            except AttributeError:
                continue  # skip paragraph objects without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    while index < len(paragraphs):
        try:
            current_text = paragraphs[index].text.strip()  # strip leading/trailing whitespace from the current paragraph
        except AttributeError:
            # skip paragraph objects without a text attribute
            index += 1
            continue

        # check whether the current paragraph is a page marker
        if pattern_marker.search(current_text):
@ -147,7 +169,11 @@ def preprocess_paragraphs(paragraphs):
            # from the current index downward, skip all consecutive blank paragraphs and markers
            skip_index = index + 1
            while skip_index < len(paragraphs):
                try:
                    skip_text = paragraphs[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue  # skip paragraph objects without a text attribute
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
@ -161,12 +187,155 @@ def preprocess_paragraphs(paragraphs):
            index += 1
            continue

        # check whether the paragraph starts with a numeric section number
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)
        if match:
            # the paragraph starts with a numeric section number: append it to processed directly
            processed.append(current_text)
            flag = True  # set the flag, ready to handle the next paragraph
            index += 1
            continue
        else:
            if flag:
                # the paragraph does not start with a numeric section number, and flag is True
                if not list_item_pattern.match(current_text):
                    if processed:
                        # **new logic begins**
                        next_non_empty_text, next_non_empty_index = find_next_text(index)
                        is_next_numbered = False
                        if next_non_empty_text:
                            is_next_numbered = bool(
                                pattern_numeric_header.match(next_non_empty_text) or
                                pattern_numeric_header_fallback.match(next_non_empty_text)
                            )
                        if is_next_numbered:
                            # append the current paragraph to the previous one only if the next paragraph starts with a numeric section number
                            processed[-1] = processed[-1] + ' ' + current_text
                        else:
                            # otherwise do not append; add it as a new paragraph instead
                            processed.append(current_text)
                        # **new logic ends**
                else:
                    # **new handling: paragraphs matching list_item_pattern should also be kept**
                    processed.append(current_text)
                # reset the flag whether or not we appended
                flag = False
                index += 1
                continue
            else:
                # flag is False: append directly to processed
                processed.append(current_text)
                index += 1
                continue

    return processed
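
With preprocess_paragraphs defined as above, a sketch of the new flag-driven merge on mock paragraphs (SimpleNamespace stands in for python-docx Paragraph objects; the strings are invented):

from types import SimpleNamespace

mock = [SimpleNamespace(text=t) for t in [
    '12.1 投标文件逾期送达的',
    '视为无效投标。',                # unnumbered continuation, and the next paragraph is numbered
    '12.2 未按规定密封的',
    '一、其他说明事项详见招标公告',  # matches list_item_pattern, so it is kept as its own paragraph
]]
print(preprocess_paragraphs(mock))
# expected: ['12.1 投标文件逾期送达的 视为无效投标。',
#            '12.2 未按规定密封的',
#            '一、其他说明事项详见招标公告']
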
# Old version
# def preprocess_paragraphs(paragraphs):
#     processed = []  # initialize the list of processed paragraphs
#     index = 0
#     # define two new regular expression patterns
#     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
#     pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
#     # pattern for list items
#     list_item_pattern = re.compile(
#         r'^\s*('
#         r'[\(]\d+[\)]|'  # matches: (1) or （1）
#         r'[A-Za-z]\.\s*|'  # matches: A. or b.
#         r'[一二三四五六七八九十]+、|'  # matches: 一、 二、 三、
#         r'第[一二三四五六七八九十百零]+[章节部分节]|'  # matches: 第x章, 第x部分, 第x节
#         r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\]?|'  # matches: A1.2 and the like
#         r'\d+(?:\.\d+)+[\s\.、.)\]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # matches numeric section numbers such as 1.1, 1.1.1
#         r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digit followed by a space, where what follows is not one of the listed keywords
#         r'(?=\d+[、.])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digit directly followed by an enumeration comma or period
#         r')'
#     )
#
#     # whether the text contains a run of whitespace longer than the given length: excludes tables and fill-in blanks
#     def has_long_spaces(text, max_space_count=5):
#         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
#
#     # regex used to detect page markers
#     pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
#
#     # helper: find the previous non-empty, non-marker paragraph
#     def find_prev_text(current_index):
#         for i in range(current_index - 1, -1, -1):
#             text = paragraphs[i].text.strip()
#             if text and not pattern_marker.search(text):
#                 return text, i
#         return '', -1
#
#     # helper: find the next non-empty, non-marker paragraph
#     def find_next_text(current_index):
#         for i in range(current_index + 1, len(paragraphs)):
#             text = paragraphs[i].text.strip()
#             if text and not pattern_marker.search(text):
#                 return text, i
#         return '', -1
#
#     while index < len(paragraphs):
#         try:
#             current_text = paragraphs[index].text.strip()  # strip leading/trailing whitespace from the current paragraph
#         except AttributeError:
#             # skip paragraph objects without a text attribute
#             index += 1
#             continue
#
#         # check whether the current paragraph is a page marker
#         if pattern_marker.search(current_text):
#             # dynamically find the previous non-empty paragraph
#             prev_text, prev_index = find_prev_text(index)
#             # dynamically find the next non-empty paragraph
#             next_text, next_index = find_next_text(index)
#
#             # apply the existing merge logic
#             if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
#                 if not prev_text.endswith(('。', '!', '?')):  # '；' and '，' are commented out for now: after a comma the sentence may not be finished
#                     # check whether the next paragraph is a list item
#                     if not list_item_pattern.match(next_text) and len(prev_text) > 30:
#                         # merge the two paragraphs
#                         merged_text = prev_text + ' ' + next_text  # add a space for readability
#                         if prev_index < len(paragraphs):
#                             # remove the previous paragraph from processed
#                             if processed and processed[-1] == prev_text:
#                                 processed.pop()
#                             # append the merged text
#                             processed.append(merged_text)
#
#                         # skip the marker and all surrounding blank paragraphs, up to next_index
#                         index = next_index + 1
#                         continue  # move on to the next iteration
#
#             # if the merge conditions are not met, skip the marker and the blank paragraphs around it:
#             # from the current index downward, skip all consecutive blank paragraphs and markers
#             skip_index = index + 1
#             while skip_index < len(paragraphs):
#                 skip_text = paragraphs[skip_index].text.strip()
#                 if skip_text == '' or pattern_marker.search(skip_text):
#                     skip_index += 1
#                 else:
#                     break
#             index = skip_index
#             continue  # move on to the next iteration
#
#         # check whether the current paragraph matches either exclusion pattern
#         if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
#             # if it matches, skip it and do not add it to the processed list
#             index += 1
#             continue
#
#         # if the paragraph is neither a marker nor an exclusion match, add it to the processed list
#         processed.append(current_text)
#         index += 1
#
#     return processed
# if the current paragraph has a section number, scan downward until a paragraph with the same numbering style is encountered
# if the current paragraph has no section number, scan downward for numbered paragraphs and pull out the run of same-style items
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
@ -224,7 +393,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    # for i in range(1, 6):  # same-level handling duplicates matched_patterns = [pattern], so commented out
    #     # copy the parts list to avoid mutating the original
    #     new_parts = parts.copy()
    #     new_parts[-1] = str(int(new_parts[-1]) + i)
    #     # use a different separator
    #     next_pattern = r'^' + r'\s*[.]\s*'.join(new_parts)
@ -262,6 +431,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
        while current_index < len(processed_paragraphs) - 1:
            current_index += 1
            next_text = processed_paragraphs[current_index].strip()
            # handle blank lines
            if not next_text:
                continue  # skip blank lines and move on to the next iteration
            if not found_next_number:
                # revised regex: also supports the 'digits 、' format
                next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|([(]\s*\d+\s*[)])|(\d+\s*、)',
@ -289,7 +461,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
    index = 0
    while index < len(processed_paragraphs):
        # print(processed_paragraphs[index].strip())
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        # print("--------------")
        index += 1
    return extracted_paragraphs
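
The driver loop above works because extract_from_text returns the last index it consumed, so the caller resumes scanning right after an extracted block. A minimal sketch of that contract, with a stand-in extractor (extract_block and the sample lines are hypothetical, not the real extractor):

def extract_block(text, i, lines, out):
    # stand-in for extract_from_text: on a hit, consume the block and
    # return the last index used so the caller resumes after it
    if text.startswith('废标'):
        j = i
        while j + 1 < len(lines) and lines[j + 1].startswith('  '):
            j += 1  # consume indented continuation lines
        out.append(lines[i:j + 1])
        return j
    return i

lines = ['前言', '废标条款一', '  细则 a', '  细则 b', '其他']
out, i = [], 0
while i < len(lines):
    i = extract_block(lines[i].strip(), i, lines, out)
    i += 1
print(out)  # [['废标条款一', '  细则 a', '  细则 b']]
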
@ -664,7 +838,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
print("无效标与废标done...") print("无效标与废标done...")
return {"无效标与废标项": combined_dict} return {"无效标与废标项": combined_dict}
#TODO:无效标这里提示词修改一下 #TODO:preprocess测试一下有无问题
if __name__ == '__main__':
    start_time = time.time()
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@ -672,12 +846,12 @@ if __name__ == '__main__':
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
    pdf_path = r'C:\Users\Administrator\Desktop\货物标\truncate_all\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目.pdf'
    output_dir = r"C:\Users\Administrator\Desktop\货物标\truncate_all"
    # invalid_added = insert_mark(pdf_path)
    # invalid_added_docx = pdf2docx(invalid_added)
    invalid_added_docx = r'C:\Users\Administrator\Desktop\货物标\truncate_all\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))