# flask_app/工程标/截取pdf工程标版.py
import fitz
import regex
import os
import time
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import clean_page_content, create_get_text_function
from flask_app.general.截取pdf通用函数 import (
    get_start_and_common_header,
    save_extracted_pages,
    get_invalid_file,
    extract_pages_tobidders_notice,
    get_notice,
    extract_pages_generic,
)
from flask_app.general.通用功能函数 import get_global_logger


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern,
                  common_header, output_suffix, invalid_endpage=-1):
    """Locate one section of a PDF with the generic matcher and save it.

    The page range is resolved by ``extract_pages_generic`` and, when both
    bounds are found, written out through ``save_extracted_pages``.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder handed to ``save_extracted_pages``.
        begin_pattern: Pattern marking the start of the section.
        begin_page: Page index forwarded to ``extract_pages_generic``.
        end_pattern: Pattern marking the end of the section.
        common_header: Repeated page header forwarded to the helpers.
        output_suffix: Section name forwarded to the helpers.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``.

    Returns:
        Whatever ``save_extracted_pages`` returns (presumably the output
        path -- verify against that helper), or ``""`` when no range was
        found or any exception occurred.
    """
    try:
        # Lines ending in "composition/preparation of the documents" style
        # phrases are handed to the generic matcher as an exclusion rule.
        exclusion_pattern = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        start_page, end_page = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page, common_header,
            exclusion_pattern, output_suffix, invalid_endpage)
        if start_page is None or end_page is None:
            return ""
        return save_extracted_pages(pdf_path, output_folder, start_page, end_page,
                                    output_suffix, common_header)
    except Exception as e:
        # Best-effort by design: report the failure and signal "nothing extracted".
        print(f"Error processing {pdf_path}: {e}")
        return ""


def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """Last-resort extraction of the "qualification" section.

    Fallback for ``truncate_pdf_main``; meant to be called once every regular
    pattern has failed. Only ``output_suffix == "qualification"`` is handled,
    any other suffix returns ``""`` immediately.

    The PDF is read with PyPDF2 first; if that raises, PyMuPDF (``fitz``) is
    tried instead.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder handed to ``save_extracted_pages``.
        output_suffix: Section name; must be ``"qualification"``.
        common_header: Repeated page header stripped by ``clean_page_content``.
        last_begin_index: Page index at which scanning starts. Negative
            values are clamped to 0.

    Returns:
        The result of ``save_extracted_pages`` when both a start and an end
        page were found, otherwise ``""``.
    """
    if output_suffix != "qualification":
        return ""
    print("最后一次尝试获取qualification!")

    # Keywords that mark a page as belonging to the qualification review.
    include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
    # Start marker: only a first (or unnumbered) appendix / attachment / table heading.
    begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
    # End marker 1: any appendix / attachment / table heading (no negative lookahead).
    end_pattern_attachment = regex.compile(
        r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$',
        regex.MULTILINE
    )
    # End marker 2: a chapter / part title closing a line.
    end_pattern_chapter = regex.compile(
        r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
        regex.MULTILINE
    )

    def process_pdf(total_pages, get_text, include_keys, begin_pattern, end_pattern_attachment,
                    end_pattern_chapter, common_header, last_begin_index):
        """Walk the pages and return ``(start_page, end_page)``; either may be None."""
        # Compile once instead of re-resolving the pattern on every page.
        begin_regex = regex.compile(begin_pattern, regex.MULTILINE)
        # A "not found" index such as -1 must not reach get_text: a negative
        # page index would not address the page the scan is meant to start on
        # (NOTE(review): PDF backends commonly count negatives from the end --
        # confirm against create_get_text_function) and could be recorded as
        # start_page. Clamping happens here so that a non-integer value still
        # fails inside the caller's try block, as before.
        first_page = max(last_begin_index, 0)
        start_page = None
        for i in range(first_page, total_pages):
            text = get_text(i)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            has_key = any(key in cleaned_text for key in include_keys)

            if start_page is None:
                # Start page: contains a keyword AND matches the start marker.
                if has_key and begin_regex.search(cleaned_text):
                    start_page = i
                continue

            # Start page is fixed from here on; look for the end page.
            if end_pattern_attachment.search(cleaned_text):
                # An attachment heading only ends the section when the page
                # no longer talks about qualification review.
                if not has_key and i > start_page:
                    return start_page, i
                print(f"当前页面匹配附件结束模式但包含 include_keys,继续查找。页面: {i}")
            elif end_pattern_chapter.search(cleaned_text) and i > start_page:
                return start_page, i
        return start_page, None

    try:
        # Primary reader: PyPDF2.
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
            start_page, end_page = process_pdf(
                total_pages, get_text, include_keys, begin_pattern,
                end_pattern_attachment, end_pattern_chapter, common_header, last_begin_index)
    except Exception as e_pypdf2:
        print(f"extract_pages_twice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        try:
            # Backup reader: PyMuPDF.
            with fitz.open(pdf_path) as pdf_document:
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                start_page, end_page = process_pdf(
                    total_pages, get_text, include_keys, begin_pattern,
                    end_pattern_attachment, end_pattern_chapter, common_header, last_begin_index)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return ""

    if start_page is None or end_page is None:
        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page,
                                output_suffix, common_header)


# NOTE(review): in the original, the header
#   def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
#                                     output_suffix="default", invalid_endpage=-1):
# opens at this point; its body lies beyond this block and is left untouched.
try: # 内嵌的处理单个文件的函数 def process_single_file(pdf_path, output_folder, selection): try: # 获取起始和通用页眉 common_header, last_begin_index = get_start_and_common_header(pdf_path, 20) begin_page = last_begin_index if last_begin_index != -1 else { 1: 0, # 公告 2: 5, # 评标 3: 5, # 资格 4: 0, # 投标人须知 5: 0 # 无效投标 }.get(selection, 0) # print(last_begin_index) if selection == 1: # 招标公告 path=get_notice(pdf_path,output_folder,begin_page,common_header,invalid_endpage) return [path or ""] elif selection == 2: # Selection 2: 评标办法 pattern_pairs = [ ( regex.compile( r'^第[一二三四五六七八九十]+(?:章|部分)\s*' r'(?