#flask_app/工程标/截取pdf工程标版.py import regex import os import time from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import clean_page_content from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages,get_invalid_file from flask_app.general.通用功能函数 import get_global_logger def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, is_secondary_match): pdf_document = PdfReader(pdf_path) exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') def run_extraction(): start_page = None mid_page = None end_page = None chapter_type = None # 用于存储“章”或“部分” combined_mid_pattern= None end_pattern = None for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) if mid_page is not None and exclusion_pattern and regex.search(exclusion_pattern, cleaned_text): continue if start_page is None: match = regex.search(begin_pattern, cleaned_text) if match and i > begin_page: start_page = i matched_text: str = match.group(0) # 获取整个匹配的文本 if '章' in matched_text: chapter_type = '章' elif '部分' in matched_text: chapter_type = '部分' else: chapter_type = None # 未匹配到“章”或“部分” if chapter_type: # 根据 chapter_type 动态生成 end_pattern if not is_secondary_match: end_pattern = regex.compile( rf'^(? 
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header):
    """
    Find a section delimited by begin/end patterns and save it as its own PDF.

    Pages are scanned in order. Each page's text is passed through
    ``clean_page_content`` together with ``common_header`` before matching.
    The first page whose text matches the exclusion keywords is skipped once;
    later pages are no longer checked against those keywords.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Forwarded unchanged to ``save_extracted_pages``.
        begin_pattern: Pattern (string or compiled) marking the start page.
        begin_page: Smallest page index that may be accepted as start page.
        end_pattern: Pattern (string or compiled) marking the end page.
        output_suffix: Section tag. ``"notice"`` and ``"invalid"`` keep the
            first start page found; ``"invalid"`` additionally requires the
            end page index to be greater than 30.
        common_header: Forwarded to ``clean_page_content`` and
            ``save_extracted_pages``.

    Returns:
        A one-element list holding the value returned by
        ``save_extracted_pages`` on success, or ``[""]`` when the start or
        the end page could not be determined.
    """
    pdf_document = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
    # For these suffixes the first begin match wins; later matches must not
    # move the start page.
    keep_first_start = output_suffix in ("notice", "invalid")
    # By default the invalid-bid section is assumed to span at least 30 pages,
    # so an end page is only accepted beyond this index for that suffix.
    invalid_min_end_index = 30

    start_page = None
    end_page = None
    exclusion_pending = True  # only the first excluded page is skipped

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)

        if exclusion_pending and regex.search(exclusion_pattern, cleaned_text):
            exclusion_pending = False
            continue

        begin_hit = regex.search(begin_pattern, cleaned_text)
        if begin_hit and i >= begin_page:
            # Compare against None explicitly: page index 0 is a valid start
            # page and must not be treated as "not found yet".
            if not (keep_first_start and start_page is not None):
                start_page = i

        # An end page must come after the start page and must not itself
        # look like a begin page.
        if start_page is None or begin_hit or i <= start_page:
            continue
        if not regex.search(end_pattern, cleaned_text):
            continue
        if output_suffix != "invalid" or i > invalid_min_end_index:
            end_page = i
            break  # stop scanning once the end page is found

    # The fallback extract_pages_twice is intentionally not called from here.
    if start_page is None or end_page is None:
        print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
        return [""]
    return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extraction, meant to run after every regular pattern pair failed.

    Only ``output_suffix == "qualification"`` is handled. Scanning starts at
    page index ``last_begin_index``. The start page is the first page that
    contains one of the qualification keywords and has a line beginning with
    an appendix/attachment/schedule heading. The end page is the first later
    page that either has such a heading without any keyword, or matches a
    chapter-style heading.

    Returns:
        A one-element list holding the value returned by
        ``save_extracted_pages`` on success; an empty list when the suffix is
        not handled, when a boundary is missing, or when any exception is
        raised (the exception is printed, not propagated).
    """
    try:
        reader = PdfReader(pdf_path)

        if output_suffix != "qualification":
            print(f"{output_suffix} twice: 未定义的输出后缀。")
            return []

        print("twice:qualificaiton!")
        # Keywords that mark a page as belonging to the qualification review.
        include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查","能力","信誉"]
        # Start heading: only "附录", "附件" or "附表" (optionally numbered one).
        start_heading = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
        # End heading, variant 1: any numbered appendix/attachment/schedule.
        attachment_end = regex.compile(
            r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$',
            regex.MULTILINE
        )
        # End heading, variant 2: chapter titles, evaluation tables,
        # instructions to bidders.
        chapter_end = regex.compile(
            r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
            r'\s*评标(?:办法|方法)前附表\s*$|'
            r'投标人须知',
            regex.MULTILINE
        )

        first_page = None
        last_page = None
        remaining_pages = reader.pages[last_begin_index:]
        for i, page in enumerate(remaining_pages, start=last_begin_index):
            raw_text = page.extract_text()
            if not raw_text:
                continue
            body = clean_page_content(raw_text, common_header)
            has_keyword = any(key in body for key in include_keys)

            # Start page: keyword page carrying an attachment-style heading.
            if (has_keyword and first_page is None
                    and regex.search(start_heading, body, regex.MULTILINE)):
                first_page = i
                continue

            if first_page is None:
                continue

            if attachment_end.search(body):
                if has_keyword:
                    # Still inside the qualification material; keep looking.
                    print(f"当前页面匹配附件结束模式但包含 include_keys,继续查找。页面: {i}")
                elif i > first_page:
                    last_page = i
                    break
            elif chapter_end.search(body) and i > first_page:
                last_page = i
                break

        if first_page is not None and last_page is not None:
            return [save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)]

        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return []
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return []
return [] def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,output_suffix="default"): try: # 内嵌的处理单个文件的函数 def process_single_file(file_path, output_folder, selection): try: # 获取起始和通用页眉 common_header, last_begin_index = get_start_and_common_header(file_path, 20) if selection == 1: # Selection 1: 招标公告 pattern_pairs = [ ( regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请' ), regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE ) ), ( regex.compile( r'.*(?