12.18 截取pdf

2024-12-18 16:01:32 +08:00 · 2024-12-18 16:01:32 +08:00 · ccfbe522f8
commit ccfbe522f8
parent b9c1201ec8
8 changed files with 233 additions and 114 deletions
--- a/flask_app/general/截取pdf_main.py
+++ b/flask_app/general/截取pdf_main.py
@ -24,36 +24,33 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
    返回:
    - list: 截取和合并后的文件路径列表，如果截取或合并失败，则包含空字符串。
    """
-    if selections is None:
-        selections = [1, 2, 3, 4, 5]

-    # 确认模式有效
-    if mode not in ['goods', 'engineering']:
-        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")
    def handle_exception(selection):
        return ["", ""] if selection == 4 else [""]
-    # 设置模式相关的参数和函数
-    if mode == 'goods':
-        logger.info("call 货物标截取pdf")
-        truncate_function = truncate_pdf_main_goods
-        merge_mode = 'goods'
+    modes_config = {
+        "goods": {"selections": [1, 2, 3, 4, 5, 6], "truncate_func": truncate_pdf_main_goods},
+        "engineering": {"selections": [1, 2, 3, 4, 5], "truncate_func": truncate_pdf_main_engineering},
+    }

-        # 根据 'goods' 模式定义异常处理的逻辑
-    else:  # mode == 'engineering'
-        logger.info("call 工程标标截取pdf")
-        truncate_function = truncate_pdf_main_engineering
-        merge_mode = 'engineering'
+    # 验证 mode 是否有效
+    if mode not in modes_config:
+        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

-    num_selections = len(selections)
-    res = check_pdf_pages(pdf_path, logger)
-    if res is not None:
-        return res  # 返回包含空字符串的列表
+    # 初始化 mode 参数
+    config = modes_config[mode]
+    truncate_function = config["truncate_func"]
+    selections = selections or config["selections"]
+
+    # 检查 PDF 页数逻辑
+    skip, empty_return = check_pdf_pages(pdf_path, mode, logger)
+    if skip:
+        return empty_return

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # 纯文件名
    truncate_files = []

    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=num_selections) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        # 提交所有任务并保持 selection 顺序
        future_to_selection = {
            selection: executor.submit(
@ -73,14 +70,17 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
            future = future_to_selection.get(selection)
            try:
                files = future.result()
-                if files:
+                # 扁平化返回的结果
+                if isinstance(files, list):
                    truncate_files.extend(files)
-                # 无需额外处理，因为 `truncate_function` 已处理空字符串的添加和日志记录
+                elif isinstance(files, str):  # 如果返回单个字符串，直接添加
+                    truncate_files.append(files)
+                else:
+                    logger.warning(f"未知的返回类型: {type(files)}，跳过该结果")
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # 根据模式和 selection 添加相应数量的空字符串
                truncate_files.extend(handle_exception(selection))
-
    # 定义合并后的输出路径
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")

@ -90,7 +90,7 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
        truncate_files,
        merged_output_path,
        base_file_name,
-        mode=merge_mode
+        mode=mode
    )

    if merged_path:
@ -109,15 +109,17 @@ if __name__ == "__main__":
    logger=get_global_logger("123")
    start_time = time.time()
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
-    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
-
+    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
+    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
+    # pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件（实高电子显示屏）.pdf'
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
    # selections = [1, 4]  # 仅处理 selection 4、1
-    # selections=[5]
+    selections=[6]
    #engineering
-    files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods')
+    # files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
+    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
    print(files)
    # selection = 1  # 例如：1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@ -7,6 +7,7 @@ from PyPDF2 import PdfReader, PdfWriter

 from flask_app.general.clean_pdf import extract_common_header, clean_page_content
 from flask_app.general.format_change import docx2pdf
+import concurrent.futures


 def get_start_and_common_header(input_path,end_page):
@ -34,20 +35,26 @@ def get_start_and_common_header(input_path,end_page):
                return common_header, last_begin_index
    return common_header, last_begin_index

-def check_pdf_pages(pdf_path, logger):
+def check_pdf_pages(pdf_path,mode, logger):
+    """Decide whether a PDF is too short to be split and build the placeholder result.
+
+    Args:
+        pdf_path: Path of the PDF whose page count is read with ``PdfReader``.
+        mode: Selects the placeholder length: 'goods' yields 8 empty strings,
+            any other value yields 7.
+        logger: Logger used for the page-count and error messages.
+
+    Returns:
+        tuple: ``(skip, placeholders)``. ``skip`` is True when the PDF has 50 pages
+        or fewer, or when its page count cannot be read; ``placeholders`` is then the
+        list of empty strings the caller returns instead of split files. For longer
+        PDFs the result is ``(False, [])``.
+    """
    try:
        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        if num_pages <= 50:
            logger.info("PDF页数小于或等于50页，跳过切分逻辑。")
-            return ['', '', '', '', '', '', '']
+            if mode=='goods':
+                return True,['', '', '', '', '', '', '','']
+            else:
+                return True,['', '', '', '', '', '', '']
        # More than 50 pages: report skip=False so the caller continues with the splitting logic.
-        return None
+        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        # The page count could not be read: report skip=True with the same empty-string placeholders.
-        return ['', '', '', '', '', '', '']
+        if mode == 'goods':
+            return True,['', '', '', '', '', '', '', '']
+        else:
+            return True,['', '', '', '', '', '', '']


 def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header):
@ -108,22 +115,151 @@ def convert_to_pdf(file_path):
        return docx2pdf(file_path)
    return file_path

-def get_invalid_file(file_path,output_folder,common_header):
-    pdf_document = PdfReader(file_path)
-    total_pages = len(pdf_document.pages)
-    begin_pattern=[regex.compile(
-                                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',regex.MULTILINE
-                            ),
+
+def get_invalid_file(file_path, output_folder, common_header, begin_page=0):
+    """
+    Cut the page range used for the "invalid bid" (无效标) analysis out of a PDF and save it.
+
+    The start page and the end page are searched independently on the cleaned page
+    texts; the resulting range is handed to ``save_extracted_pages``.
+
+    - Fewer than 100 pages: the start search is skipped and the range starts at page index 0.
+    - Otherwise: at most 30 pages from ``begin_page`` are scanned for a begin pattern,
+      pages matching the exclusion pattern are ignored, and no match falls back to index 0.
+    - More than 200 pages: the end page is searched front to back, starting at page index 30.
+    - Otherwise: the end page is searched back to front, never looking at the first 30 pages.
+    - No end match: ``max(100, int(total_pages * 2 / 3))`` above 100 pages, else the last page.
+
+    Args:
+        file_path (str): Path of the input PDF.
+        output_folder (str): Folder passed to ``save_extracted_pages`` for the output.
+        common_header (str): Header text passed to ``clean_page_content`` for every page.
+        begin_page (int): 0-based page index at which the start search begins. Defaults to 0.
+
+    Returns:
+        str: The value returned by ``save_extracted_pages`` (callers treat it as the path
+        of the saved file), or "" when the page range is invalid or any error occurs.
+    """
+    # Begin patterns, tried in priority order.
+    begin_patterns = [
        regex.compile(
-        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
-        regex.MULTILINE
-    )]
-    end_pattern=[regex.compile(
-                                r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
-                                regex.MULTILINE
-                            ),
-        regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', regex.MULTILINE),
-        regex.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*"
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
+            regex.MULTILINE
+        ),
+        regex.compile(
+            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+            regex.MULTILINE
+        )
    ]

+    # End patterns, tried in priority order.
+    end_patterns = [
+        regex.compile(
+            r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告',
+            regex.MULTILINE
+        ),
+        regex.compile(
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
+            regex.MULTILINE
+        ),
+        regex.compile(
+            r"\s*(投标文件|响应文件|响应性文件)(?:的)?格式\s*",
+            regex.MULTILINE
+        )
+    ]
+
+    # Pages containing any of these phrases are never accepted as the start page.
+    exclusion_pattern = regex.compile(
+        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成',
+        regex.MULTILINE
+    )
+
+    try:
+        pdf_document = PdfReader(file_path)
+        total_pages = len(pdf_document.pages)
+
+        # Extract and clean the text of every page once; both searches read this list.
+        page_texts = []
+        for i in range(total_pages):
+            page = pdf_document.pages[i]
+            text = page.extract_text()
+            cleaned_text = clean_page_content(text, common_header) if text else ""
+            page_texts.append(cleaned_text)
+
+        def find_start_page(begin_page):
+            # Scan at most 30 pages forward from begin_page, one begin pattern at a time.
+            for pattern in begin_patterns:
+                for i in range(begin_page, min(begin_page + 30, total_pages)):
+                    text = page_texts[i]
+                    if regex.search(exclusion_pattern, text):
+                        continue
+                    if regex.search(pattern, text):
+                        return i
+            return 0  # No begin pattern matched: start from the first page.
+
+        def find_end_page():
+            if total_pages > 200:
+                # Long documents: search front to back, skipping the first 30 pages.
+                for pattern in end_patterns:
+                    for i in range(30, total_pages):
+                        text = page_texts[i]
+                        if regex.search(pattern, text):
+                            return i
+            else:
+                # Search back to front and stop once only the first 30 pages remain.
+                for pattern in end_patterns:
+                    for i in range(total_pages - 1, 29, -1):
+                        text = page_texts[i]
+                        if regex.search(pattern, text):
+                            return i
+
+            # No end pattern matched: fall back to a size-based end page.
+            if total_pages > 100:
+                print("未找到结束模式，总页数大于100，设置结束页为前100页。"+file_path)
+                return max(100, int(total_pages * 2 / 3))
+            else:
+                print("未找到结束模式，设置结束页为最后一页。"+file_path)
+                return total_pages - 1
+
+        # Short documents always start at the first page; only the end page is searched.
+        if total_pages < 100:
+            start_page = 0
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_end = executor.submit(find_end_page)
+                end_page = future_end.result()
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_start = executor.submit(find_start_page,begin_page)
+                future_end = executor.submit(find_end_page)
+                # result() re-raises any exception from the worker; the outer except handles it.
+                start_page = future_start.result()
+                end_page = future_end.result()
+
+        # Validate the page range.
+        if start_page > end_page:
+            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
+            # Return "" rather than [""]: callers test the result for truthiness and wrap it
+            # in a list themselves, so a non-empty list here would be nested as [[""]].
+            return ""
+
+        # Delegate the actual page extraction to the shared save helper.
+        output_path = save_extracted_pages(
+            pdf_path=file_path,
+            output_folder=output_folder,
+            start_page=start_page,
+            end_page=end_page,
+            output_suffix="invalid",
+            common_header=common_header
+        )
+
+        return output_path
+
+    except Exception as e:
+        print(f"处理文件 {file_path} 时发生错误: {e}")
+        return ""
+
+if __name__ == "__main__":
+    # Manual smoke test: extract the "invalid bid" page range from one local sample PDF.
+    file_path=r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf'
+    output_folder=r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp'
+    # Pass begin_page explicitly (0 = start the search at the first page); the
+    # original call omitted this argument.
+    res=get_invalid_file(file_path,output_folder,"",0)
+    print(res)

--- a/flask_app/old_version/截取文件格式.py
+++ b/flask_app/old_version/截取文件格式.py
--- a/flask_app/routes/工程标解析main.py
+++ b/flask_app/routes/工程标解析main.py
@ -54,7 +54,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
    tobidders_notice_table = truncate_files[3]       #投标人须知前附表
    tobidders_notice=truncate_files[4]   #投标人须知正文

-    invalid_path=truncate_files[5] if truncate_files[5]!="" else pdf_path    #无效标
+    invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path    #无效标
    # invalid_docpath = copy_docx(docx_path)  # docx截取无效标部分
    # invalid_docpath=pdf2docx(invalid_path)
    invalid_added_pdf = insert_mark(invalid_path)
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@ -43,7 +43,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')  # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文

    # 处理各个部分
-    invalid_path=pdf_path
+    invalid_path=truncate_files[6] if truncate_files[6] != "" else pdf_path    #无效标

    invalid_added_pdf = insert_mark(invalid_path)
    invalid_added_docx = pdf2docx(invalid_added_pdf)  # 有标记的invalid_path
@ -62,7 +62,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
    qualification_path = truncate_files[2]  # 资格审查
    tobidders_notice_path = truncate_files[4]  # 投标人须知正文
    notice_path = truncate_files[0]      #招标公告
-    merged_baseinfo_path = truncate_files[6]  # 合并封面+招标公告+投标人须知前附表+须知正文
+    merged_baseinfo_path = truncate_files[7]  # 合并封面+招标公告+投标人须知前附表+须知正文
    clause_path = convert_clause_to_json(tobidders_notice_path, output_folder)  # 投标人须知正文条款pdf->json

    end_time = time.time()
--- a/flask_app/test_case/test_正则表达式.py
+++ b/flask_app/test_case/test_正则表达式.py
@ -1,10 +1,9 @@
 import regex

 begin_pattern = regex.compile(
-    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
-    r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
-    regex.MULTILINE
-)
+            r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
+            regex.MULTILINE
+        )


 # 测试示例
--- a/flask_app/工程标/截取pdf工程标版.py
+++ b/flask_app/工程标/截取pdf工程标版.py
@ -220,14 +220,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
                return []
            else:
                return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
-        elif output_suffix == "invalid":
-            pdf_document = PdfReader(pdf_path)
-            total_pages = len(pdf_document.pages)
-            # 计算总页数的三分之二
-            total = int(total_pages * 2 / 3)
-            start_page = last_begin_index
-            end_page = min(90, total)
-            return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
        else:
            print(f"{output_suffix} twice: 未定义的输出后缀。")
            return []
@ -346,29 +338,35 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
                    ]
                    output_suffix = "tobidders_notice"
                elif selection == 5:
-                    # Selection 5: 无效标
-                    pattern_pairs = [
-                        (
-                            regex.compile(
-                                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
-                            ),
-                            regex.compile(
-                                r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
-                                regex.MULTILINE
-                            )
-                        ),
-                        (
-                            regex.compile(
-                                r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
-                                regex.MULTILINE
-                            ),
-                            regex.compile(
-                                r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
-                                regex.MULTILINE
-                            )
-                        )
-                    ]
-                    output_suffix = "invalid"
+                    begin_page=last_begin_index
+                    invalid_path = get_invalid_file(file_path,output_folder,common_header,begin_page)
+                    if invalid_path:
+                        return [invalid_path]
+                    else:
+                        return [""]
+                #     # Selection 5: 无效标
+                #     pattern_pairs = [
+                #         (
+                #             regex.compile(
+                #                 r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
+                #             ),
+                #             regex.compile(
+                #                 r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
+                #                 regex.MULTILINE
+                #             )
+                #         ),
+                #         (
+                #             regex.compile(
+                #                 r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+                #                 regex.MULTILINE
+                #             ),
+                #             regex.compile(
+                #                 r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
+                #                 regex.MULTILINE
+                #             )
+                #         )
+                #     ]
+                #     output_suffix = "invalid"
                else:
                    print("无效的选择:请选择1-5")
                    return [""]
@ -383,21 +381,6 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
                        5: 0  # 无效标
                    }.get(selection, 0)

-                    if selection == 5: #无效标
-                        invalid_path = list(
-                            get_invalid_file(
-                                file_path,
-                                output_folder,
-                                begin_pattern,
-                                common_header,
-                            )
-                        )
-                        if invalid_path:
-                            return invalid_path
-                        else:
-                            # print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
-                            continue
-
                    if selection == 4:  # 投标人须知
                        output_paths = list(
                            extract_pages_tobidders_notice(
@ -474,14 +457,14 @@ if __name__ == "__main__":
    logger=get_global_logger("123")
    start_time = time.time()
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
-    pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles"

    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
-    # selection = 1  # 例如：1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
-    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
-    # print(generated_files)
+    output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp"
+    selection = 5  # 例如：1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
+    generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
+    print(generated_files)
    # print("生成的文件:", generated_files)
    end_time = time.time()
    print("耗时：" + str(end_time - start_time))
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -4,7 +4,7 @@ import os  # 用于文件和文件夹操作
 from flask_app.general.clean_pdf import clean_page_content, extract_common_header
 from flask_app.general.merge_pdfs import merge_and_cleanup
 from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
-    convert_to_pdf
+    convert_to_pdf, get_invalid_file
 from flask_app.general.通用功能函数 import get_global_logger

 def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
@ -473,6 +473,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    os.makedirs(output_folder)

                # 获取起始和通用页眉
+                pdf_path = convert_to_pdf(input_path)
                common_header, last_begin_index = get_start_and_common_header(input_path, 10)
                begin_page = last_begin_index if last_begin_index != 0 else {
                    4: 1,
@ -529,13 +530,12 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    )
                    local_output_suffix = "procurement"
                elif selection == 6:
-                    begin_pattern = regex.compile(
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*'
-                    )
-                    end_pattern = regex.compile(
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
-                    )
-                    local_output_suffix = "format"
+                    begin_page = last_begin_index
+                    invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
+                    if invalid_path:
+                        return [invalid_path]
+                    else:
+                        return [""]
                else:
                    print("无效的选择:请选择1-6")
                    return ['']
@ -545,7 +545,6 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                    output_suffix = local_output_suffix

                # 将原先的 process_files 逻辑合并到此处
-                pdf_path = convert_to_pdf(input_path)
                result = extract_pages(
                    pdf_path,
                    output_folder,