zbparse/flask_app/工程标/截取pdf工程标版.py

# flask_app/工程标/截取pdf工程标版.py
import fitz
import regex
import os
import time
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import clean_page_content,create_get_text_function
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
    extract_pages_tobidders_notice, get_notice, extract_pages_generic
from flask_app.general.通用功能函数 import get_global_logger


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,invalid_endpage=-1):
    """Find one section's page range with the generic matcher and save it.

    The begin/end patterns, starting page, common header, output suffix and
    ``invalid_endpage`` are forwarded unchanged to ``extract_pages_generic``
    together with an exclusion pattern built here. When both a start and an
    end page come back, the range is handed to ``save_extracted_pages`` and
    its return value is returned.

    Returns:
        Whatever ``save_extracted_pages`` returns on success; ``""`` when
        either page boundary is ``None`` or when any exception is raised
        (the exception is printed, not re-raised).
    """
    try:
        # Lines ending with these phrases ("composition/preparation of the
        # documents") are supplied to the matcher as its exclusion pattern.
        skip_headings = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        page_range = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page,
            common_header, skip_headings, output_suffix, invalid_endpage)
        first_page, last_page = page_range
        # Guard clause: both boundaries are required before anything is saved.
        if first_page is None or last_page is None:
            return ""
        return save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)
    except Exception as err:
        print(f"Error processing {pdf_path}: {err}")
        return ""


def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Last-resort extraction of the qualification section.

    Called after every regular pattern pair has failed. Only the
    ``"qualification"`` suffix is handled; any other suffix returns ``""``
    immediately.

    Pages are scanned from ``last_begin_index`` (clamped to 0) to the end:

    * start page - the first page that contains one of the qualification
      keywords and has a line starting with an appendix/attachment heading
      (附录/附件/附表, optionally numbered 一 or 1, followed by a colon);
    * end page - a later page that either has an appendix/attachment heading
      line and none of the keywords, or (when no such heading is on the page)
      matches the chapter/part heading pattern.

    The PDF is read with PyPDF2 first; if that raises, PyMuPDF (fitz) is
    tried; if that raises as well, ``""`` is returned.

    Returns:
        The return value of ``save_extracted_pages`` when both pages were
        found, otherwise ``""``.
    """
    if output_suffix != "qualification":
        return ""
    print("最后一次尝试获取qualification!")
    # Keywords that mark a page as belonging to the qualification review.
    include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
    # Start pattern: only the first appendix / attachment / attached table.
    begin_pattern = r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:])'
    # End pattern 1: any appendix / attachment / attached-table heading line.
    end_pattern_attachment = regex.compile(
        r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:]).*$',
        regex.MULTILINE
    )
    # End pattern 2: a chapter / part heading at the end of a line.
    end_pattern_chapter = regex.compile(
        r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
        regex.MULTILINE
    )

    def process_pdf(total_pages, get_text, include_keys, begin_pattern, end_pattern_attachment, end_pattern_chapter,
                    common_header, last_begin_index):
        """
        Walk the pages and return ``(start_page, end_page)``; either may be None.
        """
        start_page = None
        end_page = None
        # The caller uses -1 as its "no start index found" sentinel. Passing a
        # negative value to range() would begin the scan at index -1, which
        # the PDF backends can resolve to the LAST page, so clamp to page 0.
        first_index = max(last_begin_index, 0)
        for i in range(first_index, total_pages):
            text = get_text(i)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            # Evaluated once per page; used by both the start and end checks.
            has_include_key = any(key in cleaned_text for key in include_keys)

            # Start page: contains a keyword AND matches the begin pattern.
            if start_page is None and has_include_key:
                if regex.search(begin_pattern, cleaned_text, regex.MULTILINE):
                    start_page = i
                    continue

            # Once the start page is known, look for the end page.
            if start_page is not None:
                if end_pattern_attachment.search(cleaned_text):
                    # An attachment heading ends the section only when the
                    # page no longer mentions any qualification keyword.
                    if not has_include_key and i > start_page:
                        end_page = i
                        break
                    else:
                        print(f"当前页面匹配附件结束模式但包含 include_keys，继续查找。页面: {i}")
                # Chapter headings are only considered on pages without an
                # attachment heading.
                elif end_pattern_chapter.search(cleaned_text) and i > start_page:
                    end_page = i
                    break
        return start_page, end_page

    # First attempt: PyPDF2.
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
            start_page, end_page = process_pdf(total_pages, get_text, include_keys, begin_pattern,
                                               end_pattern_attachment, end_pattern_chapter,
                                               common_header, last_begin_index)
    except Exception as e_pypdf2:
        print(f"extract_pages_twice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Second attempt: PyMuPDF.
        try:
            with fitz.open(pdf_path) as pdf_document:
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                start_page, end_page = process_pdf(total_pages, get_text, include_keys, begin_pattern,
                                                   end_pattern_attachment, end_pattern_chapter,
                                                   common_header, last_begin_index)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return ""

    if start_page is None or end_page is None:
        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中！")
        return ""
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)


def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
    """Cut one section out of an engineering tender PDF (or every PDF in a folder).

    Args:
        input_path: Path to a PDF file or to a directory containing PDFs.
            The ``.pdf`` extension is matched case-insensitively.
        output_folder: Forwarded to the extraction helpers.
        selection: Which section to extract:
            1 - tender notice (``get_notice``),
            2 - evaluation method (suffix ``evaluation_method``),
            3 - qualification review (suffix ``qualification``),
            4 - instructions to bidders (``extract_pages_tobidders_notice``,
                two results),
            5 - "invalid" file (``get_invalid_file``).
        logger: Used via ``logger.error`` to report per-file failures.
        output_suffix: Not read by this function; the suffix is chosen from
            ``selection``. Kept so existing callers keep working.
        invalid_endpage: Forwarded to the extraction helpers for selections
            1-4.

    Returns:
        For a single PDF, a list:
        ``[path]`` for selections 1-3, ``[path1, path2]`` for 4 and
        ``[invalid_path, end_page]`` for 5, with ``""`` standing in for
        missing paths. On error or invalid input, ``["", ""]`` for selection 4
        and ``[""]`` otherwise.
        For a directory, a list holding one such per-file list for every PDF
        found.
    """
    def empty_result():
        # Selection 4 produces two files, so its failure value has two slots.
        # NOTE(review): selection 5 returns two elements on success but only
        # one here; confirm callers tolerate that before widening it.
        return [""] * (2 if selection == 4 else 1)

    try:
        # Handles exactly one PDF file.
        def process_single_file(pdf_path, output_folder, selection):
            try:
                # Common page header plus the index where the real content starts.
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                # -1 means "no start index found": fall back to a per-section default.
                begin_page = last_begin_index if last_begin_index != -1 else {
                    1: 0,  # notice
                    2: 5,  # evaluation method
                    3: 5,  # qualification
                    4: 0,  # instructions to bidders
                    5: 0   # invalid bids
                }.get(selection, 0)
                if selection == 1:
                    # Tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                elif selection == 2:
                    # Evaluation method: (begin, end) pattern pairs tried in order.
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                                r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                                regex.MULTILINE
                            ),
                            # Alternative end pattern
                            regex.compile(
                                r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                                r'[：:]清标报告\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文(?:部分)?\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                                regex.MULTILINE
                            )
                        ),
                        (
                            # The quote lookbehinds below cover ", “ and ” like
                            # every other pattern here (they used to repeat the
                            # ASCII quote three times).
                            regex.compile(
                                r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first form: chapter heading
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()（）]*?)'
                                r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
                                r'[\u4e00-\u9fff、()（）]*\s*$|'
                                r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)(?:前附表)?\s*$)',
                                # second form: bare "evaluation method" title line
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "evaluation_method"
                elif selection == 3:
                    # Qualification review conditions
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:]).*(?:资质|资格).*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                                r'[\u4e00-\u9fff、()（）]*资格[\u4e00-\u9fff、()（）]*\s*$',
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:])(?!.*(?:资质|资格)).*|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "qualification"
                elif selection == 4:
                    # Instructions to bidders: two output files.
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                elif selection == 5:
                    # Content before the bid-document-format or contract-terms part.
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]
                else:
                    print("无效的选择:请选择1-5")
                    return [""]

                # Selections 2 and 3: the first pattern pair that yields a file wins.
                for begin_pattern, end_pattern in pattern_pairs:
                    result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        invalid_endpage=invalid_endpage,
                        end_pattern=end_pattern,
                        output_suffix=section_suffix,
                        common_header=common_header
                    )
                    if result:
                        return [result]

                # Every pair failed: try the fallback (it only acts on "qualification").
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。{pdf_path}")
                fallback_result = extract_pages_twice(
                    pdf_path,
                    output_folder,
                    section_suffix,
                    common_header,
                    last_begin_index
                )
                return [fallback_result or ""]

            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return empty_result()

        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                # Case-insensitive so that e.g. "SCAN.PDF" is not silently skipped.
                if not file_name.lower().endswith('.pdf'):
                    continue
                pdf_path = os.path.join(input_path, file_name)
                # process_single_file always returns a list; it is appended as
                # one element, so the result is a list of per-file lists.
                # NOTE(review): confirm callers expect this nested shape.
                generated_files.append(process_single_file(pdf_path, output_folder, selection))
            return generated_files
        elif os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return empty_result()
    except Exception as e:
        # print (not logger) on purpose: this also catches failures of the
        # logger call made while handling a per-file error.
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return empty_result()


if __name__ == "__main__":
    # Manual smoke run against a local sample file; the paths below are
    # developer-machine specific.
    run_logger = get_global_logger("123")
    started_at = time.time()
    sample_pdf = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程.pdf"
    target_dir = r"C:\Users\Administrator\Desktop\货物标\output2"
    # Section selector: 1 - notice, 2 - evaluation_method, 3 - qualification,
    # 4 - instructions to bidders (front table part 1 + body part 2), 5 - invalid.
    section_choice = 2
    outputs = truncate_pdf_main_engineering(sample_pdf, target_dir, section_choice, run_logger)
    print(outputs)
    elapsed = time.time() - started_at
    print("耗时：" + str(elapsed))
-.3 修改bug

											
										
										
											2025-01-03 17:36:23 +08:00
+								# flask_app/工程标/截取pdf工程标版.py
-.20 pypdf2读取失败可转为使用fitz库

											
										
										
											2025-01-21 14:21:49 +08:00
+								import fitz
-.9 截取pdf逻辑优化

											
										
										
											2024-12-10 17:32:08 +08:00
+								import regex
-.29截取工程标新逻辑

											
										
										
											2024-10-29 09:04:23 +08:00
+								import os
-.31

											
										
										
											2024-10-31 15:03:32 +08:00
+								import time
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								from PyPDF2 import PdfReader
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								from flask_app.general.读取文件.clean_pdf import clean_page_content,create_get_text_function
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    extract_pages_tobidders_notice, get_notice, extract_pages_generic
-.17 无效投标、废标

											
										
										
											2024-12-17 18:44:58 +08:00
+								from flask_app.general.通用功能函数 import get_global_logger
-.1适配货物标

											
										
										
											2024-11-01 14:28:10 +08:00
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,invalid_endpage=-1):
 								    try:
 								        exclusion_pattern = regex.compile(
 								            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-.20 pypdf2读取失败可转为使用fitz库

											
										
										
											2025-01-21 14:21:49 +08:00
+								        start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page,
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                                     common_header, exclusion_pattern, output_suffix, invalid_endpage)
 								        if start_page is not None and end_page is not None:
 								            return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
 								        else:
 								            return ""
 								    except Exception as e:
 								        print(f"Error processing {pdf_path}: {e}")
 								        return ""
-.1适配货物标

											
										
										
											2024-11-01 14:28:10 +08:00
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
-.1适配货物标

											
										
										
											2024-11-01 14:28:10 +08:00
+								def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
 								    """
 								    处理 PDF 文件，作为 truncate_pdf_main 的后备方法。
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								    当所有模式均失败后调用此方法。
-.1适配货物标

											
										
										
											2024-11-01 14:28:10 +08:00
+								    """
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								    if output_suffix != "qualification":
 								        return ""
 								    print("最后一次尝试获取qualification!")
 								    # 动态设置关键字
 								    include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
 								    # 定义起始匹配模式，仅匹配“附录”、“附件”或“附表”
 								    begin_pattern = r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:])'
 								    # 定义结束匹配模式 - 附录、附件、附表等（移除负向前瞻）
 								    end_pattern_attachment = regex.compile(
 								        r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:]).*$',
 								        regex.MULTILINE
 								    )
 								    # 定义结束匹配模式 - 章节标题、评标附表、投标人须知等
 								    end_pattern_chapter = regex.compile(
 								        r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
 								        regex.MULTILINE
 								    )
 								    def process_pdf(total_pages, get_text, include_keys, begin_pattern, end_pattern_attachment, end_pattern_chapter,
 								                    common_header, last_begin_index):
 								        """
 								        遍历 PDF 的各页，依据匹配规则确定起始页和结束页。
 								        """
 								        start_page = None
 								        end_page = None
 								        for i in range(last_begin_index, total_pages):
 								            text = get_text(i)
 								            if not text:
 								                continue
 								            cleaned_text = clean_page_content(text, common_header)
 								            # 确定起始页：页面包含 include_keys 且匹配 begin_pattern
 								            if start_page is None and any(key in cleaned_text for key in include_keys):
 								                if regex.search(begin_pattern, cleaned_text, regex.MULTILINE):
 								                    start_page = i
 								                    continue
 								            # 一旦起始页确定，则查找结束页
 								            if start_page is not None:
 								                # 结束模式：附件类
 								                if end_pattern_attachment.search(cleaned_text):
 								                    # 如果页面不包含 include_keys 且页码大于起始页，则认为找到了结束页
 								                    if not any(key in cleaned_text for key in include_keys) and i > start_page:
 								                        end_page = i
 								                        break
 								                    else:
 								                        print(f"当前页面匹配附件结束模式但包含 include_keys，继续查找。页面: {i}")
 								                # 结束模式：章节标题等
 								                elif end_pattern_chapter.search(cleaned_text) and i > start_page:
 								                    end_page = i
 								                    break
 								        return start_page, end_page
 								    # 尝试使用 PyPDF2 方式读取 PDF
-.1适配货物标

											
										
										
											2024-11-01 14:28:10 +08:00
+								    try:
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								        with open(pdf_path, "rb") as f:
 								            pdf_document = PdfReader(f)
 								            get_text = create_get_text_function('pypdf2', pdf_document)
-.20 pypdf2读取失败可转为使用fitz库

											
										
										
											2025-01-21 14:21:49 +08:00
+								            total_pages = len(pdf_document.pages)
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								            start_page, end_page = process_pdf(total_pages, get_text, include_keys, begin_pattern,
 								                                               end_pattern_attachment, end_pattern_chapter,
 								                                               common_header, last_begin_index)
 								    except Exception as e_pypdf2:
 								        print(f"extract_pages_twice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
 								        # 使用 PyMuPDF 作为备用方案
 								        try:
 								            with fitz.open(pdf_path) as pdf_document:
 								                get_text = create_get_text_function('fitz', pdf_document)
-.20 pypdf2读取失败可转为使用fitz库

											
										
										
											2025-01-21 14:21:49 +08:00
+								                total_pages = pdf_document.page_count
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								                start_page, end_page = process_pdf(total_pages, get_text, include_keys, begin_pattern,
 								                                                   end_pattern_attachment, end_pattern_chapter,
 								                                                   common_header, last_begin_index)
 								        except Exception as e_fitz:
 								            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
 								            return ""
 								    if start_page is None or end_page is None:
 								        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中！")
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								        return ""
-.16 尝试解决内存泄漏

											
										
										
											2025-02-16 18:09:45 +08:00
+								    else:
 								        return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
-.14 失败的文件进行记录，截取pdf进一步优化

											
										
										
											2025-01-14 17:10:38 +08:00
+								def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								    try:
 								        # 内嵌的处理单个文件的函数
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								        def process_single_file(pdf_path, output_folder, selection):
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								            try:
 								                # 获取起始和通用页眉
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
-.13

											
										
										
											2025-02-14 11:07:21 +08:00
+								                begin_page = last_begin_index if last_begin_index != -1 else {
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+: 0,  # 公告
 : 5,  # 评标
 : 5,  # 资格
 : 0,  # 投标人须知
 : 0   # 无效投标
 								                }.get(selection, 0)
-.14 失败的文件进行记录，截取pdf进一步优化

											
										
										
											2025-01-14 17:10:38 +08:00
+								                # print(last_begin_index)
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                if selection == 1:
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                    # 招标公告
 								                    path=get_notice(pdf_path,output_folder,begin_page,common_header,invalid_endpage)
 								                    return [path or ""]
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                elif selection == 2:
 								                    # Selection 2: 评标办法
 								                    pattern_pairs = [
 								                        (
 								                            regex.compile(
-.26 优化了商务要求提取，保证有商务要求

											
										
										
											2024-12-26 17:20:27 +08:00
+								                                r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
-.26 优化了商务要求提取，保证有商务要求

											
										
										
											2024-12-26 17:20:27 +08:00
+								                                r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                regex.MULTILINE
 								                            ),
 								                            # Alternative end pattern
 								                            regex.compile(
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
 								                                r'[：:]清标报告\s*$|'
 								                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文(?:部分)?\s*$|'
 								                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                regex.MULTILINE
 								                            )
 								                        ),
 								                        (
 								                            regex.compile(
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # 第一种模式
 								                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()（）]*?)'
-.26 优化了商务要求提取，保证有商务要求

											
										
										
											2024-12-26 17:20:27 +08:00
+								                                r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'  # 修改的部分
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                r'[\u4e00-\u9fff、()（）]*\s*$|'
-.20 pypdf2读取失败可转为使用fitz库

											
										
										
											2025-01-21 15:53:31 +08:00
+								                                r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)评标(方法|办法)(?:前附表)?\s*$)',
-.3 修改bug

											
										
										
											2025-01-03 17:36:23 +08:00
+								                                # 第二种模式
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                regex.MULTILINE
 								                            ),
 								                            regex.compile(
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                regex.MULTILINE
 								                            )
 								                        )
 								                    ]
 								                    output_suffix = "evaluation_method"
 								                elif selection == 3:
 								                    # Selection 3: 资格审查条件
 								                    pattern_pairs = [
 								                        (
 								                            regex.compile(
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                                r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:]).*(?:资质|资格).*$|'
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                r'[\u4e00-\u9fff、()（）]*资格[\u4e00-\u9fff、()（）]*\s*$',
 								                                regex.MULTILINE
 								                            ),
 								                            regex.compile(
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                                r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:])(?!.*(?:资质|资格)).*|'
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                                regex.MULTILINE
 								                            )
 								                        )
 								                    ]
 								                    output_suffix = "qualification"
 								                elif selection == 4:
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                    # 投标人须知
-.14 失败的文件进行记录，截取pdf进一步优化

											
										
										
											2025-01-14 17:10:38 +08:00
+								                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                    return [path1 or "", path2 or ""]
-.7 添加注释

											
										
										
											2025-02-07 15:27:24 +08:00
+								                elif selection == 5:    #除去投标文件格式之前的内容 或者 合同条款之前的内容
-.14 失败的文件进行记录，截取pdf进一步优化

											
										
										
											2025-01-14 17:10:38 +08:00
+								                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
 								                    return [invalid_path or "", end_page]
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                else:
 								                    print("无效的选择:请选择1-5")
 								                    return [""]
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                for begin_pattern, end_pattern in pattern_pairs:
 								                    result = extract_pages(
 								                        pdf_path,
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                        output_folder,
 								                        begin_pattern,
 								                        begin_page=begin_page,
-.14 失败的文件进行记录，截取pdf进一步优化

											
										
										
											2025-01-14 17:10:38 +08:00
+								                        invalid_endpage=invalid_endpage,
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                        end_pattern=end_pattern,
 								                        output_suffix=output_suffix,
 								                        common_header=common_header
-.17 无效投标、废标

											
										
										
											2024-12-17 18:44:58 +08:00
+								                    )
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                    if result:
 								                        return [result]
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                    else:
 								                        # print(f"Selection {selection}: 使用模式对 {idx} 失败。尝试下一个模式对。")
 								                        continue
 								                # 如果所有模式对都失败，尝试回退
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。{pdf_path}")
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                fallback_result = extract_pages_twice(
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                    pdf_path,
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                    output_folder,
 								                    output_suffix,
 								                    common_header,
 								                    last_begin_index
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
+								                )
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                return [fallback_result or ""]
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
 								            except Exception as e:
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                logger.error(f"Error in processing file '{pdf_path}': {e}")
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								                return [""] * (2 if selection == 4 else 1)
 								        # 检查是否为文件夹
 								        if os.path.isdir(input_path):
 								            generated_files = []
 								            for file_name in os.listdir(input_path):
 								                if file_name.endswith('.pdf'):
-												合并工程标货物标的截取逻辑，优化代码

											
										
										
											2025-01-13 17:14:54 +08:00
+								                    pdf_path = os.path.join(input_path, file_name)
 								                    files = process_single_file(pdf_path, output_folder, selection)
 								                    if isinstance(files, tuple):
 								                        generated_files.extend([f if f else "" for f in files])
 								                    else:
 								                        generated_files.append(files)
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								            return generated_files
 								        elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
 								            return process_single_file(input_path, output_folder, selection)
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
+								        else:
-.18 截取pdf

											
										
										
											2024-12-18 10:32:24 +08:00
+								            print("提供的路径既不是文件夹也不是PDF文件。")
 								            return [""] * (2 if selection == 4 else 1)
 								    except Exception as e:
 								        print(f"Error in truncate_pdf_main_goods: {e}")
 								        return [""] * (2 if selection == 4 else 1)
-.29截取工程标新逻辑

											
										
										
											2024-10-29 09:04:23 +08:00
-.3 修改bug

											
										
										
											2025-01-03 17:36:23 +08:00
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
if __name__ == "__main__":
    # Manual driver: run the engineering-bid PDF truncation on one local
    # sample file and print the generated paths plus the elapsed time.
    logger = get_global_logger("123")
    started_at = time.time()

    # Other sample inputs used during development:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\货物标\output2"

    # Section selector:
    #   1 - notice (tender announcement)
    #   2 - evaluation_method (bid evaluation method)
    #   3 - qualification review; output suffix is qualification1 or
    #       qualification2 (noted by the author as consistent with the
    #       evaluation method)
    #   4 - instructions to bidders: preliminary table (part1) and
    #       main text (part2)
    #   5 - invalid
    selection = 2

    generated_files = truncate_pdf_main_engineering(
        pdf_path,
        output_folder,
        selection,
        logger,
    )
    print(generated_files)

    elapsed = time.time() - started_at
    print("耗时：" + str(elapsed))