1.13

2025-01-13 11:27:52 +08:00 · 2025-01-13 11:27:52 +08:00 · 1f0d65e904
commit 1f0d65e904
parent a99923bea8
4 changed files with 134 additions and 127 deletions
--- a/flask_app/general/insert_del_pagemark.py
+++ b/flask_app/general/insert_del_pagemark.py
@ -95,11 +95,11 @@ def delete_mark(docx_path):

 if __name__ == '__main__':
    # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
-    # input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
-    # output=insert_mark(input)
-    doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
-    res=delete_mark(doc_path)
-    if res:
-        print(res)
-    else:
-        print("No")
+    input=r'C:\Users\Administrator\Downloads\国网湖北电力荆州供电公司2024年第四次服务授权竞争性谈判采购-采购文件（15DJ04）.pdf'
+    output=insert_mark(input)
+    # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
+    # res=delete_mark(doc_path)
+    # if res:
+    #     print(res)
+    # else:
+    #     print("No")
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@ -294,29 +294,107 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                    break
    return start_page, end_page

-def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header):
+
+def generate_end_pattern(extraction_stage, chapter_type=None):
+    """
+    Build the compiled end-of-section pattern for an extraction stage and chapter type.
+
+    Args:
+        extraction_stage (str): Stage marker. 'third' selects the third-attempt template; any other value (e.g. 'first') selects the first-attempt template.
+        chapter_type (str, optional): Chapter keyword substituted into the template, e.g. '章' or '部分'. A falsy value substitutes '章|部分'.
+
+    Returns:
+        regex.Pattern: The selected template compiled with regex.MULTILINE.
+    """
+    # First-attempt template: a line starting with "第<numeral><chapter_type>" plus CJK text, or a line ending in 评标(方法|办法)前附表 / 正文; the lookbehinds reject matches preceded by 见, 与 or a quote mark.
+    end_pattern_template_first = (
+        r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
+        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$'
+        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$'
+    )
+    # Third-attempt template: "第<numeral><chapter_type>" followed by a title that runs to the end of the line; it is not anchored at the line start.
+    end_pattern_template_third = (
+        r'第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()（）]+\s*$'
+    )
+
+    # Pick the template for the stage, then fill in {chapter_type} with str.format.
+    if extraction_stage == 'third':
+        template = end_pattern_template_third
+    else:
+        template = end_pattern_template_first
+
+    if chapter_type:
+        pattern_str = template.format(chapter_type=chapter_type)
+    else:
+        pattern_str = template.format(chapter_type='章|部分')
+
+    return regex.compile(pattern_str, regex.MULTILINE)
+
+
+def generate_mid_pattern(chapter_type=None):
+    """
+    Build the compiled mid-page pattern, optionally extended for a chapter type.
+
+    Args:
+        chapter_type (str, optional): '章' adds an alternative for lines starting with "第…部分"; '部分' adds one for lines starting with "第…章"; any other value adds nothing.
+
+    Returns:
+        regex.Pattern: The combined pattern compiled with regex.MULTILINE.
+    """
+    # Base pattern: a numbered or punctuated heading followed by 说明 / 总则 / 名词解释, or a line ending in "…须知正文" that is not preceded by 见, 与 or a quote mark.
+    base_mid_pattern = (
+        r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'
+        r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
+        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
+    )
+    # The extra alternative uses the heading keyword opposite to chapter_type ('章' -> 部分, '部分' -> 章).
+    additional_mid_pattern = ''
+    if chapter_type:
+        if chapter_type == '章':
+            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
+        elif chapter_type == '部分':
+            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
+
+    if additional_mid_pattern:
+        combined_pattern = rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})'
+    else:
+        combined_pattern = base_mid_pattern
+
+    return regex.compile(combined_pattern, regex.MULTILINE)
+
+
+
+def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header):
    """
    从PDF文档中提取起始页、中间页和结束页。
    如果第一次提取失败，则使用新的 begin_pattern 和 end_pattern 重新提取。
    """
    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
-    def run_extraction(local_begin_pattern, local_end_pattern=None):
+
+    def run_extraction(begin_pattern, extraction_stage='first'):
        """
-        使用提供的 begin 和 end 模式运行提取过程。
-        如果未提供 local_end_pattern，则根据匹配的章节类型动态生成 end_pattern。
+        使用提供的 begin_pattern 运行提取过程。
+        根据 extraction_stage 判断是“第一次提取”还是“第三次提取”，
+        从而选择对应的默认 end_pattern（动态生成）。
+
        参数:
-            local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
-            local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
+            begin_pattern (regex.Pattern): 用于识别起始的正则表达式模式。
+            extraction_stage (str): 提取阶段标记，如 'first' 表示第一次，'third' 表示第三次。

        返回:
            tuple: (start_page, mid_page, end_page)
        """
+
+        # ===== 函数内部变量 =====
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None  # 中间页的组合模式
+
        pdf_document = PdfReader(pdf_path)
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+
+        # ===== 遍历每一页，执行提取逻辑 =====
        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
@ -324,90 +402,41 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
            # 如果已经找到中间页，且当前页匹配排除模式，则跳过
            if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
                continue
-            if begin_page==0 and catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录，跳过当前页面

-            # 识别起始页
+            # 如果为第 0 页之后的目录，直接跳过
+            if start_page == 0 and catalog_pattern.search(cleaned_text):
+                continue
+
+            # ========== 识别起始页 ==========
            if start_page is None:
-                match = regex.search(local_begin_pattern, cleaned_text)
-                if match and i > begin_page:
+                match = regex.search(begin_pattern, cleaned_text)
+                if match and (start_page is None or i > start_page):
                    start_page = i
-                    matched_text:str = match.group(0)  # 获取整个匹配的文本
-                    # 如果未提供固定的 end_pattern，则根据匹配的章节类型动态生成
-                    if not local_end_pattern:
-                        if '章' in matched_text:
-                            chapter_type = '章'
-                        elif '部分' in matched_text:
-                            chapter_type = '部分'
-                        else:
-                            chapter_type = None  # 未匹配到“章”或“部分”
-
-                        if chapter_type:
-                            # 根据 chapter_type 动态生成 end_pattern
-                            end_pattern_dynamic = regex.compile(
-                                rf'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
-                                regex.MULTILINE
-                            )
-
-                            # 根据 chapter_type 动态生成 additional_mid_pattern
-                            if chapter_type == '章':
-                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
-                            elif chapter_type == '部分':
-                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
-                            else:
-                                additional_mid_pattern = ''
-
-                            # 定义基础的 mid_pattern
-                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
-                                               r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
-
-                            # 合并基础模式和额外模式
-                            if additional_mid_pattern:
-                                combined_mid_pattern = regex.compile(
-                                    rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
-                                    regex.MULTILINE
-                                )
-                            else:
-                                combined_mid_pattern = regex.compile(
-                                    rf'{base_mid_pattern}',
-                                    regex.MULTILINE
-                                )
-                        else:
-                            # 如果未匹配到“章”或“部分”，使用默认的 end_pattern 和 mid_pattern
-                            end_pattern_dynamic = regex.compile(
-                                r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
-                                regex.MULTILINE
-                            )
-
-                            # 定义基础的 mid_pattern
-                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
-                            combined_mid_pattern = regex.compile(
-                                rf'{base_mid_pattern}',
-                                regex.MULTILINE
-                            )
+                    matched_text = match.group(0)  # 获取整个匹配的文本
+                    # 根据匹配到的内容识别是“章”还是“部分”，用于后续组合 mid_pattern
+                    if '章' in matched_text:
+                        chapter_type = '章'
+                    elif '部分' in matched_text:
+                        chapter_type = '部分'
                    else:
-                        # 如果提供了固定的 end_pattern，则使用默认的 mid_pattern
-                        base_mid_pattern = r'.*[（(]?\s*[一二12]?[)）]?\s*[、．.]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$'  # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
-                        combined_mid_pattern = regex.compile(
-                            rf'{base_mid_pattern}',
-                            regex.MULTILINE
-                        )
+                        chapter_type = None  # 未匹配到“章”或“部分”
+
+                    # ===== 动态生成 end_pattern_dynamic =====
+                    end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
+                    # ========== 动态生成 mid_pattern ==========
+                    combined_mid_pattern = generate_mid_pattern(chapter_type)
+                    # 找到 start_page 后，继续循环，进入下一页
                    continue

-            # 识别中间页
+            # ========== 识别中间页 ==========
            if start_page is not None and mid_page is None and combined_mid_pattern:
-                # if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
-                #     continue
                if regex.search(combined_mid_pattern, cleaned_text):
                    mid_page = i

-            # 识别结束页
+            # ========== 识别结束页 ==========
            if start_page is not None and mid_page is not None:
-                # 使用提供的 end_pattern 或动态生成的 end_pattern
-                current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
-                if regex.search(current_end_pattern, cleaned_text):
+                # 当前使用的 end_pattern（上面根据 extraction_stage 和 chapter_type 选择好了）
+                if regex.search(end_pattern_dynamic, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break
@ -423,7 +452,12 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
        return path1, path2

    # 第一次提取尝试，使用初始的 begin_pattern
-    start_page, mid_page, end_page = run_extraction(begin_pattern)
+    begin_pattern = regex.compile(
+        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
+        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
+        regex.MULTILINE
+    )
+    start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
    # 如果第一次提取成功，保存并返回路径
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
@ -435,19 +469,13 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)

-    # 如果第二次提取失败，定义新的 begin_pattern 和 end_pattern 进行第三次尝试
+    print("第二次提取tobidders_notice失败，尝试第三次提取!")
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
        r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
        regex.MULTILINE
    )
-    new_end_pattern = regex.compile(
-        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
-        regex.MULTILINE
-    )
-    print("第二次提取tobidders_notice失败，尝试第三次提取!")
-    start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
-
+    start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
    else:
--- a/flask_app/routes/judge_zbfile.py
+++ b/flask_app/routes/judge_zbfile.py
@ -88,14 +88,14 @@ def build_response(judge_result: JudgeResult, logger) -> Any:
            data=''
        )
    elif judge_result == JudgeResult.YES:
-        logger.error("判断是否为招标文件成功！YES")
+        logger.info("判断是否为招标文件成功！YES")
        return create_response_normal(
            message='判断是否为招标文件成功！',
            status='success',
            data='yes'
        )
    elif judge_result == JudgeResult.NO:
-        logger.error("判断是否为招标文件成功！NO")
+        logger.info("判断是否为招标文件成功！NO")
        return create_response_normal(
            message='判断是否为招标文件成功！',
            status='success',
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -248,13 +248,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    )
                    local_output_suffix = "qualification1"
                elif selection == 4:
-                    begin_pattern = regex.compile(
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
-                        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
-                        regex.MULTILINE
-                    )
-                    end_pattern = None
-                    local_output_suffix = "tobidders_notice"
+                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header)
+                    return [path1 or "", path2 or ""]
                elif selection == 5:
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
@ -266,12 +261,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    )
                    local_output_suffix = "procurement"
                elif selection == 6:
-                    begin_page = last_begin_index
                    invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
-                    if invalid_path:
-                        return [invalid_path]
-                    else:
-                        return [""]
+                    return [invalid_path or ""]
                else:
                    print("无效的选择:请选择1-6")
                    return ['']
@ -279,11 +270,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                # 如果传入的 output_suffix 是 'default'，则使用本地生成的 output_suffix
                if output_suffix == "default":
                    output_suffix = local_output_suffix
-
-                if selection==4:
-                    result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header)
-                else:
-                    result = extract_pages(
+                result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
@ -294,20 +281,12 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    )
                # 根据提取结果以及不同的 output_suffix 进行处理
                if result:
-                    if output_suffix == "tobidders_notice":
-                        # 确保返回的是元组，并将其中的 None 转换为 ""
-                        if isinstance(result, tuple) and len(result) == 2:
-                            path1, path2 = result
-                            return [path1 or "", path2 or ""]
-                        else:
-                            logger.error("Unexpected result format for 'tobidders_notice'")
-                            return ["",""]
-                    elif output_suffix == "qualification1":
+                    if output_suffix == "qualification1":
                        merge_and_cleanup(result, "qualification3")
                        return [result or ""]
                    else:
                        return [result or ""]
-                return [""] * (2 if selection == 4 else 1)
+                return [""]

            except Exception as e:
                logger.error(f"Error in processing file '{input_path}': {e}")
@ -347,10 +326,10 @@ if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp"
+    output_folder = r"C:\Users\Administrator\Desktop\货物标\output4_2"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
    selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)