# flask_app/general/截取pdf通用函数.py import concurrent.futures import os import fitz import regex from PyPDF2 import PdfReader, PdfWriter from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content, create_get_text_function, \ get_text_pypdf2 from flask_app.general.format_change import docx2pdf import concurrent.futures def get_start_and_common_header(input_path, end_page): """ 提取 PDF 文件的公共页眉和起始页索引。 参数: - input_path: PDF 文件路径。 - end_page: 遍历的结束页码。 返回: - common_header: 公共页眉内容。 - last_begin_index: 起始页的页码索引。 """ try: # 提取公共页眉(假设该函数已定义) common_header = extract_common_header(input_path) except Exception as e: print(f"提取公共页眉时出错: {e}") return "", -1 # 定义匹配模式 begin_pattern = regex.compile( r'.*(? end_page: return -1 # 超出 end_page 范围则返回 -1 text = get_text(i) if text: cleaned_text = clean_page_content(text, common_header) # 如果页面中包含“目录”,则跳过 if catalog_pattern.search(cleaned_text): continue if begin_pattern.search(cleaned_text): return i # 找到匹配的起始页,返回页码(从0开始) return -1 # 未找到匹配页 # 先尝试使用 PyPDF2 读取 PDF try: with open(input_path, "rb") as f: pdf_document = PdfReader(f) total_pages = len(pdf_document.pages) get_text = create_get_text_function('pypdf2', pdf_document) last_begin_index = process_pages(get_text, total_pages) return common_header, last_begin_index except Exception as e_pypdf2: print(f"get_start_and_common_header: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") # 如果 PyPDF2 读取失败,尝试使用 PyMuPDF(fitz) try: with fitz.open(input_path) as pdf_document: total_pages = pdf_document.page_count get_text = create_get_text_function('fitz', pdf_document) last_begin_index = process_pages(get_text, total_pages) return common_header, last_begin_index except Exception as e_fitz: print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") return common_header, -1 def check_pdf_pages(pdf_path, mode, logger): def generate_invalid_return(): # 根据 mode 返回不同数量的空字符串及 pdf_path return (True, ['', '', '', '', '', '', pdf_path, '']) if mode == 'goods' else ( True, ['', '', '', '', '', pdf_path, '']) try: try: # 尝试使用 PyPDF2 读取 PDF(新版可直接传入文件对象) with open(pdf_path, "rb") as f: pdf_document = PdfReader(f) num_pages = len(pdf_document.pages) except Exception as e_pypdf2: logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") # 如果 PyPDF2 读取失败,尝试使用 PyMuPDF 读取 PDF with fitz.open(pdf_path) as pdf_document: num_pages = pdf_document.page_count if num_pages <= 30: logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。") return generate_invalid_return() # 若页数大于30页,返回 False 表示继续处理 return False, [] except Exception as e: logger.error(f"无法读取 PDF 页数: {e}") return generate_invalid_return() def extract_and_save_pages(pdf_document, from_page, to_page, output_path, is_using_pypdf2): """ 抽取 PDF 某个范围 [from_page, to_page] 的页面并保存到 output_path。 根据 is_using_pypdf2 判断使用 PyPDF2 还是 PyMuPDF。 """ if is_using_pypdf2: # 使用 PyPDF2 writer = PdfWriter() for page_num in range(from_page, to_page + 1): try: writer.add_page(pdf_document.pages[page_num]) except Exception as e_page: print(f"添加第 {page_num} 页到 writer 时出错: {e_page}") continue try: with open(output_path, 'wb') as f_out: writer.write(f_out) print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}") except Exception as e_write: print(f"保存时出错: {e_write}") return "" return output_path else: # 使用 PyMuPDF output_doc = fitz.open() try: for page_num in range(from_page, to_page + 1): try: output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) except Exception as e_page: print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") continue output_doc.save(output_path) print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}") return output_path except Exception as e_write: print(f"保存时出错: {e_write}") return "" finally: output_doc.close() def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header): # 若 start_page 或 end_page 无效,直接退出 if start_page is None or end_page is None: print("错误: start_page 或 end_page 为 None") return "" def process_document(pdf_document, total_pages, using_pypdf2, get_text): # 检查页面范围 if start_page < 0 or end_page >= total_pages or start_page > end_page: print(f"无效的页面范围: {start_page} 到 {end_page}") return "" base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") # 如果 output_suffix 为 'notice' 并且 start_page > 0,则额外保存“目录前”的页面 if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") toc_page = -1 # 先遍历从 0 到 start_page - 1,尝试查找“目录”文字 for page_num in range(min(start_page, total_pages)): try: text = get_text(page_num) cleaned_text = clean_page_content(text, common_header) if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): toc_page = page_num break except Exception as e_extract: print(f"提取第 {page_num} 页文本时出错: {e_extract}") continue # toc_page 找到则提取到 toc_page,否则提取到 start_page - 1 pages_to_extract = toc_page + 1 if toc_page != -1 else start_page # 如果找到了页面可提取,且 pages_to_extract > 0 if pages_to_extract > 0: extract_and_save_pages(pdf_document, 0, pages_to_extract - 1, before_pdf_path, using_pypdf2) # 最后提取并保存目标页面 return extract_and_save_pages(pdf_document, start_page, end_page, output_pdf_path, using_pypdf2) # 第一阶段:尝试使用 PyPDF2 try: with open(pdf_path, "rb") as pdf_file: # 这里构造 PdfReader,确保上下文结束后 pdf_file 会被关闭 pdf_document = PdfReader(pdf_file) total_pages = len(pdf_document.pages) get_text = create_get_text_function('pypdf2', pdf_document) using_pypdf2 = True # 在该 with 块内调用处理函数 return process_document(pdf_document, total_pages, using_pypdf2, get_text) except Exception as e_pypdf2: print(f"使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") # 第二阶段:PyPDF2 失败后再尝试 PyMuPDF try: with fitz.open(pdf_path) as pdf_document: total_pages = pdf_document.page_count get_text = create_get_text_function('fitz', pdf_document) using_pypdf2 = False # 在该 with 块内调用处理函数 return process_document(pdf_document, total_pages, using_pypdf2, get_text) except Exception as e_fitz: print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") return "" def is_pdf_or_doc(filename): # 判断文件是否为PDF或Word文档 return filename.lower().endswith(('.pdf', '.doc', '.docx')) def get_invalid_file(pdf_path, output_folder, common_header, begin_page): """ 从指定的PDF文件中提取无效部分并保存到输出文件夹中。 begin_pattern匹配开头 end_pattern匹配结尾 页数小于100不进行begin_pattern,默认start_page=0 页数大于200时end_pattern从前往后匹配 Args: pdf_path (str): 输入的PDF文件路径。 output_folder (str): 提取后文件的输出文件夹路径。 common_header (str): 公共头部文本,用于清理每页的内容。 Returns: list: 包含保存的文件路径的列表,如果提取失败则返回包含空字符串的列表。 """ # 定义开始模式 begin_patterns = [ regex.compile( r'.*(? 200: # 当页数较多时,从第31页开始正向查找结束模式 for pattern in end_patterns: # 需提前定义好 for i in range(30, total_pages): text = page_texts[i] # 定义局部的排除模式 exclusion_pat = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE ) if regex.search(exclusion_pat, text): continue if regex.search(pattern, text): end_exclusion_pat = regex.compile(r'第[一二三四五六七八九十百千]+(?:章|部分)\s*“[^“”]*合同[^“”]*”', regex.DOTALL) if regex.search(end_exclusion_pat, text): continue # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i else: # 当页数较少时,从后向前查找(至少保留前30页) for pattern in end_patterns: for i in range(total_pages - 1, 29, -1): text = page_texts[i] if regex.search(pattern, text): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i # 若未匹配到结束模式,设置默认结束页 if total_pages > 200: print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}。{pdf_path}") return max(100, int(total_pages * 1 / 2)) elif 100 < total_pages <= 200: print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。 " + pdf_path) return max(100, int(total_pages * 2 / 3)) else: print("未找到结束模式,设置结束页为最后一页。 " + pdf_path) return total_pages - 1 # 根据总页数决定是否并发查找起始页与结束页 if total_pages < 100: start_page_found = 0 with concurrent.futures.ThreadPoolExecutor() as executor: future_end = executor.submit(find_end_page) end_page_found = future_end.result() else: with concurrent.futures.ThreadPoolExecutor() as executor: future_start = executor.submit(find_start_page, begin_page) future_end = executor.submit(find_end_page) start_page_found = future_start.result() end_page_found = future_end.result() # 检查页码范围是否有效 if start_page_found > end_page_found: print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})") return "", 0 # 调用已实现的保存函数,保存提取的页面 output_path = save_extracted_pages( pdf_path=pdf_path, output_folder=output_folder, start_page=start_page_found, end_page=end_page_found, output_suffix="invalid", common_header=common_header ) return output_path, end_page_found except Exception as e: print(f"处理文件 {pdf_path} 时发生错误: {e}") return "", 0 finally: # 确保释放资源(对 fitz 的 pdf_document 需要关闭) if pdf_document and isinstance(pdf_document, fitz.Document): pdf_document.close() def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, output_suffix="normal", invalid_endpage=-1): def process_pages(get_text, total_pages): start_page = None end_page = None flag = True end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages) for i in range(begin_page, end_limit): text = get_text(i) if text: cleaned_text = clean_page_content(text, common_header) if output_suffix == "tobidders_notice2": catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text): flag = False continue if begin_page == 0 and catalog_pattern.search(cleaned_text): continue # 如果存在目录,跳过当前页面 else: # 防止匹配到正文前附表/正文部分 if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text): flag = False continue if start_page is None and regex.search(begin_pattern, cleaned_text): if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): start_page = i continue if start_page is not None: if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]: if regex.search(end_pattern, cleaned_text) and i > start_page: end_page = i break else: if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text): end_page = i break return start_page, end_page # 尝试使用 PyPDF2,并使用 with open 打开文件 try: with open(pdf_path, "rb") as f: pdf_document = PdfReader(f) total_pages = len(pdf_document.pages) get_text = create_get_text_function('pypdf2', pdf_document) start_page, end_page = process_pages(get_text, total_pages) return start_page, end_page except Exception as e_pypdf2: print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") # 如果 PyPDF2 失败,尝试使用 PyMuPDF try: with fitz.open(pdf_path) as pdf_document: total_pages = pdf_document.page_count get_text = create_get_text_function('fitz', pdf_document) start_page, end_page = process_pages(get_text, total_pages) return start_page, end_page except Exception as e_fitz: print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") return None, None def generate_mid_pattern(chapter_type=None): """ 根据章节类型动态生成 mid_pattern。 参数: chapter_type (str, optional): 章节类型,如 '章' 或 '部分'。 返回: regex.Pattern: 编译后的正则表达式模式。 """ # 定义基础的 mid_pattern base_mid_pattern = ( r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|' r'(? begin_page: start_page = i matched_text = match.group(0) # 获取整个匹配的文本 # 根据匹配到的内容识别是“章”还是“部分”,用于后续组合 mid_pattern if '章' in matched_text: chapter_type = '章' elif '部分' in matched_text: chapter_type = '部分' else: chapter_type = None # 未匹配到“章”或“部分” # ===== 动态生成 end_pattern_dynamic ===== end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type) # ========== 动态生成 mid_pattern ========== combined_mid_pattern = generate_mid_pattern(chapter_type) # 找到 start_page 后,继续循环,进入下一页 continue # ========== 识别中间页 ========== if start_page is not None and mid_page is None and combined_mid_pattern: if regex.search(combined_mid_pattern, cleaned_text): mid_page = i # ========== 识别结束页 ========== if start_page is not None and mid_page is not None: # 当前使用的 end_pattern(上面根据 extraction_stage 和 chapter_type 选择好了) if regex.search(end_pattern_dynamic, cleaned_text): if i > mid_page: end_page = i break return start_page, mid_page, end_page def perform_extraction_and_save(start_page, mid_page, end_page): path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_part1", common_header) if mid_page != end_page: path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice_part2", common_header) else: path2 = path1 return path1, path2 def process_extraction(get_text, total_pages): # 第一次提取尝试,使用初始的 begin_pattern begin_pattern = regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|' r'(? begin_page) begin_page2 = end_page1 - 1 start_page2, end_page2 = extract_pages_generic( pdf_path, second_begin_pattern, end_pattern, begin_page2, common_header, exclusion_pattern, output_suffix ) if start_page2 is None or end_page2 is None: return start_page1, end_page1, end_page1, end_page1 return start_page1, end_page1, start_page2, end_page2 except Exception as e: print(f"读取 PDF 文件失败: {e}") return None, None, None, None def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage): begin_pattern = regex.compile( r'.*(?