"""Look up a keyword's page number in a PDF and split the PDF from that page."""

import re

import PyPDF2

# Digits at the very end of a line (trailing whitespace allowed).
_TRAILING_NUMBER = re.compile(r"\d+(?=\s*$)")


def extract_contents_with_pages(pdf_path, keyword):
    """Return the number that ends the first line mentioning ``keyword``.

    Pages are scanned in order and each page's extracted text is examined
    line by line.  The keyword comparison is case-insensitive.  A line that
    contains the keyword but does not end in digits is skipped and the
    search continues.

    NOTE(review): presumably meant to read a table-of-contents entry.  The
    result is whatever number is printed in the text; it is not checked
    against the PDF's physical page order.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to look for within each line.

    Returns:
        The trailing number of the first matching line as an ``int``, or
        ``None`` when no line contains the keyword followed by a trailing
        number.
    """
    needle = keyword.lower()
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                continue
            for line in text.split("\n"):
                if needle not in line.lower():
                    continue
                match = _TRAILING_NUMBER.search(line)
                if match:
                    return int(match.group(0))
    # Every page was scanned without finding a usable line.
    return None


def split_pdf(pdf_path, start_page, output_path):
    """Write pages ``start_page`` through the last page of a PDF to a new file.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to keep.  Any value
            accepted by ``int()`` is allowed (for example ``"12"``).
        output_path: Path the resulting PDF is written to.

    Raises:
        ValueError: If ``start_page`` is not between 1 and the number of
            pages in the source PDF.  Without this check a value below 1
            would produce a negative index and copy pages from the end of
            the document first, and a value past the end would silently
            write an empty PDF.
    """
    start_page = int(start_page)
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        page_count = len(reader.pages)
        if not 1 <= start_page <= page_count:
            raise ValueError(
                f"start_page must be between 1 and {page_count}, "
                f"got {start_page}"
            )

        writer = PyPDF2.PdfWriter()
        # Page numbers are 1-based while reader.pages is 0-based.
        for index in range(start_page - 1, page_count):
            writer.add_page(reader.pages[index])

        # Write while the source file is still open: the reader may load
        # page data lazily from it.
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)


if __name__ == "__main__":
    # Usage example; guarded so importing this module does not touch files.
    pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
    output_path = "D:\\项目\\工程招标\\tb_format.pdf"
    keyword = "投标文件格式"  # change this to the keyword you want to find
    page_number = extract_contents_with_pages(pdf_path, keyword)
    print(page_number)
    if page_number is not None:
        split_pdf(pdf_path, page_number, output_path)
    else:
        print("未找到含有关键字的页码")