import re import os import time from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import clean_page_content, extract_common_header from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering import concurrent.futures import logging def get_global_logger(unique_id): if unique_id is None: return logging.getLogger() # 获取默认的日志器 logger = logging.getLogger(unique_id) return logger logger = None def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header): try: # 获取文件基本名称 base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # 构建主要输出文件路径 output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") # 读取PDF文件 pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) # 检查起始和结束页码是否有效 if start_page < 0 or end_page >= total_pages or start_page > end_page: print(f"无效的页面范围: {start_page} 到 {end_page}") return "" # 如果 output_suffix 是 'notice',保存 start_page 之前的页面(若遇到'目录',则截取到'目录') if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") before_doc = PdfWriter() toc_page = -1 # 查找目录页 for page_num in range(min(start_page, total_pages)): page_text = pdf_document.pages[page_num].extract_text() cleaned_text = clean_page_content(page_text, common_header) if re.search(r'目\s*录', cleaned_text, re.MULTILINE): toc_page = page_num break # 确定截取的页数 pages_to_extract = toc_page + 1 if toc_page != -1 else start_page # 提取页面 for page_num in range(pages_to_extract): before_doc.add_page(pdf_document.pages[page_num]) print(before_pdf_path) with open(before_pdf_path, 'wb') as f_before: before_doc.write(f_before) # print(f"已保存页面从 0 到 {pages_to_extract} 为 {before_pdf_path}") # 提取指定范围的页面 output_doc = PdfWriter() for page_num in range(start_page, end_page + 1): output_doc.add_page(pdf_document.pages[page_num]) # 保存新的PDF文件 with open(output_pdf_path, 'wb') as f_output: output_doc.write(f_output) print(f"{output_suffix} 已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") return output_pdf_path except Exception as e: print(f"Error in save_pages_to_new_pdf: {e}") return "" # 返回空字符串 def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, is_secondary_match): pdf_document = PdfReader(pdf_path) exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') def run_extraction(): start_page = None mid_page = None end_page = None chapter_type = None # 用于存储“章”或“部分” for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: continue if start_page is None: match = re.search(begin_pattern, cleaned_text) if match and i > begin_page: start_page = i matched_text = match.group(0) # 获取整个匹配的文本 if '章' in matched_text: chapter_type = '章' elif '部分' in matched_text: chapter_type = '部分' else: chapter_type = None # 未匹配到“章”或“部分” if chapter_type: # 根据 chapter_type 动态生成 end_pattern if not is_secondary_match: end_pattern = re.compile( rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' r'^评标办法前附表|' r'^附录(?:一)?[::]|' r'^附件(?:一)?[::]|' r'^附表(?:一)?[::]', re.MULTILINE ) else: end_pattern = re.compile( rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$' ) # 根据 chapter_type 动态生成 additional_mid_pattern if chapter_type == '章': additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)' elif chapter_type == '部分': additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)' else: additional_mid_pattern = '' # 定义基础的 mid_pattern base_mid_pattern = ( r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)' ) # 合并基础模式和额外模式 if additional_mid_pattern: combined_mid_pattern = re.compile( rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})', re.MULTILINE ) else: combined_mid_pattern = re.compile( rf'{base_mid_pattern}', re.MULTILINE ) # print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern else: # 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern if not is_secondary_match: end_pattern = re.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|' r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line r'^附录(?:一)?[::]|' # Match "附录一" with optional "一" and colon r'^附件(?:一)?[::]|' # Match "附件一" with optional "一" and colon r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon re.MULTILINE ) else: end_pattern = re.compile( rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$' ) # print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern # 定义基础的 mid_pattern base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' combined_mid_pattern = re.compile( rf'{base_mid_pattern}', re.MULTILINE ) continue if start_page is not None and mid_page is None and combined_mid_pattern: if re.search(combined_mid_pattern, cleaned_text): mid_page = i if start_page is not None and mid_page is not None and chapter_type: if re.search(end_pattern, cleaned_text): if i > mid_page: end_page = i break return start_page, mid_page, end_page # 运行提取 start_page, mid_page, end_page = run_extraction() if start_page is None or end_page is None or mid_page is None: print(f"first: tobidders_notice 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") return "", "" path1 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice_table", start_page, mid_page, common_header) path2 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice", mid_page, end_page, common_header) return path1, path2 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header, is_secondary_match=False): # 打开PDF文件 pdf_document = PdfReader(pdf_path) start_page = None end_page = None exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') # 遍历文档的每一页,查找开始和结束短语的位置 for i in range(len(pdf_document.pages)): page = pdf_document.pages[i] text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) # if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" if re.search(exclusion_pattern, cleaned_text): continue if re.search(begin_pattern, cleaned_text) and i >= begin_page: if start_page and (output_suffix == "notice" or output_suffix == "invalid"): pass else: start_page = i if start_page is not None and re.search(end_pattern, cleaned_text): condition = i > start_page if condition: is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页 if is_invalid_condition or output_suffix != "invalid": end_page = i break # 找到结束页后退出循环 # 移除对 extract_pages_twice 的调用 if start_page is None or end_page is None: print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!") return [""] else: return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index): """ 处理 PDF 文件,作为 truncate_pdf_main 的后备方法。 此函数将在所有模式对都失败后调用。 """ try: pdf_document = PdfReader(pdf_path) if output_suffix == "qualification": print("twice:qualificaiton!") # 动态设置 include_keys include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查","能力","信誉"] # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])' # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) end_pattern_attachment = re.compile( r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$', re.MULTILINE ) # 定义结束匹配模式 - 章节标题、评标附表、投标人须知等 end_pattern_chapter = re.compile( r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|' r'\s*评标(?:办法|方法)前附表\s*$|' r'投标人须知', re.MULTILINE ) start_page = None end_page = None # 从指定的开始页开始遍历 for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index): text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) # 确定起始页,需在last_begin_index之后 if any(key in cleaned_text for key in include_keys): if re.search(begin_pattern, cleaned_text, re.MULTILINE): if start_page is None: start_page = i # 确保起始页不小于章节的开始页码 continue # print(f"匹配到附录/附件/附表,设置起始页为: {start_page}") # 确定结束页 - 附录、附件、附表等 if start_page is not None and end_pattern_attachment.search(cleaned_text): # 额外检查当前页面是否不包含任何 include_keys if not any(key in cleaned_text for key in include_keys): if i > start_page: end_page = i # print(f"找到结束页 (附件类): {end_page}") break # 找到结束页后退出循环 else: print(f"当前页面匹配附件结束模式但包含 include_keys,继续查找。页面: {i}") # 确定结束页 - 章节标题、评标附表、投标人须知等 elif start_page is not None and end_pattern_chapter.search(cleaned_text): if i > start_page: end_page = i # print(f"找到结束页 (章节标题等): {end_page}") break # 找到结束页后退出循环 if start_page is None or end_page is None: print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!") return [] else: return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] elif output_suffix == "invalid": pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) # 计算总页数的三分之二 total = int(total_pages * 2 / 3) start_page = last_begin_index end_page = min(90, total) return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] else: print(f"{output_suffix} twice: 未定义的输出后缀。") return [] except Exception as e: print(f"Error in extract_pages_twice: {e}") return [] def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 begin_pattern = re.compile( r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', re.MULTILINE ) pdf_document = PdfReader(input_path) for i, page in enumerate(pdf_document.pages): if i > 25: return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 return common_header, last_begin_index return common_header, last_begin_index def truncate_pdf_main(input_path, output_folder, selection): if os.path.isdir(input_path): generated_files = [] for file_name in os.listdir(input_path): if file_name.endswith('.pdf'): file_path = os.path.join(input_path, file_name) files = truncate_pdf_main(file_path, output_folder, selection) if files: generated_files.extend(files) return generated_files elif os.path.isfile(input_path) and input_path.endswith('.pdf'): # base_file_name = os.path.splitext(os.path.basename(input_path))[0] common_header, last_begin_index = get_start_and_common_header(input_path) # print(last_begin_index) if selection == 1: # Selection 1: 投标人须知前附表 pattern_pairs = [ ( re.compile( r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'), re.compile( r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE) ), ( re.compile( r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', re.MULTILINE), re.compile( r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE) ) ] output_suffix = "tobidders_notice" elif selection == 2: # Selection 2: 评标办法 pattern_pairs = [ ( re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'), # Alternative begin pattern re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+') # Alternative end pattern ), ( re.compile( r'(?