#flask_app/general/截取pdf通用函数.py
"""Shared helpers used when cutting sections out of a PDF."""
import concurrent.futures
import os

import regex
from PyPDF2 import PdfReader

from flask_app.general.clean_pdf import extract_common_header, clean_page_content

# PDFs with at most this many pages are not split at all.
_MAX_PAGES_WITHOUT_SPLIT = 50


# FIXME: get_start_and_common_header(input_path, end_page) is truncated in this
# copy of the file: the text breaks off inside the `begin_pattern` regex
# literal, and the rest of that pattern, the `catalog_pattern` definition, the
# reader construction and the page-loop header are all missing.  The surviving
# fragment is kept below (code tokens as found, inline comments translated,
# indentation inferred) so that nothing is lost.  Restore the real definition
# from version control instead of guessing at the missing pieces.
#
#   def get_start_and_common_header(input_path,end_page):
#       common_header = extract_common_header(input_path)
#       last_begin_index = 0
#       begin_pattern = regex.compile( r'.*(?
#       # ---- text missing here ----
#       ... end_page: return common_header, 0  # page index is past end_page: report index 0
#       text = page.extract_text()
#       if text:
#           cleaned_text = clean_page_content(text, common_header)
#           # a page that matches catalog_pattern ("目录") is skipped
#           if catalog_pattern.search(cleaned_text): continue
#           if begin_pattern.search(cleaned_text):
#               last_begin_index = i  # index of the first match (0-based)
#               return common_header, last_begin_index
#       return common_header, last_begin_index


def _empty_split_result(mode):
    """Return the placeholder list of empty strings for the given mode.

    Mode 1 yields seven entries; any other mode yields four.
    """
    return [''] * (7 if mode == 1 else 4)


def check_pdf_pages(pdf_path, mode, logger):
    """Decide from the page count whether a PDF should go through splitting.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PdfReader``.
        mode: Selects the shape of the placeholder result: ``1`` gives seven
            empty strings, anything else gives four.
        logger: Object exposing ``info`` and ``error``; used for progress and
            failure messages.

    Returns:
        ``None`` when the PDF has more than 50 pages, meaning the caller
        should carry on with splitting.  Otherwise a list of empty strings
        (seven for mode 1, else four), returned both when the PDF has 50 pages
        or fewer and when its page count cannot be read.
    """
    try:
        num_pages = len(PdfReader(pdf_path).pages)
    except Exception as e:
        # Best effort: an unreadable PDF is reported and treated like a
        # document that needs no splitting, rather than aborting the caller.
        logger.error(f"无法读取 PDF 页数: {e}")
        return _empty_split_result(mode)

    logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
    if num_pages <= _MAX_PAGES_WITHOUT_SPLIT:
        logger.info(f"PDF页数小于或等于{_MAX_PAGES_WITHOUT_SPLIT}页,跳过切分逻辑。")
        return _empty_split_result(mode)

    # More pages than the limit: None tells the caller to continue processing.
    return None