# 43 lines | 1.7 KiB | Python
import PyPDF2
|
||
import re
|
||
|
||
def extract_contents_with_pages(pdf_path, keyword):
    """Return the number printed at the end of the first line mentioning *keyword*.

    The text extracted from the PDF is scanned page by page, line by line.
    The first line that contains ``keyword`` (compared case-insensitively)
    and ends with a run of digits wins; those digits are returned as the
    page number.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to look for. An empty keyword is treated as
            "not found" and the file is not opened.

    Returns:
        The trailing number of the first matching line as an ``int``, or
        ``None`` when no line both contains the keyword and ends in digits.
    """
    # An empty keyword is a substring of every line, so it cannot identify
    # anything; report "not found" instead of returning an arbitrary number.
    if not keyword:
        return None

    # Loop invariants: lower-case the keyword and compile the pattern once.
    needle = keyword.lower()
    trailing_digits = re.compile(r"\d+(?=\s*$)")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                # Nothing could be extracted from this page; move on.
                continue
            for line in text.split("\n"):
                if needle not in line.lower():
                    continue
                # Only digits at the very end of the line (ignoring trailing
                # whitespace) are accepted as the page number.
                match = trailing_digits.search(line)
                if match:
                    return int(match.group(0))

    # Every page was scanned without finding a usable line.
    return None
|
||
|
||
def split_pdf(pdf_path, start_page, output_path):
    """Copy the pages from ``start_page`` through the last page into a new PDF.

    Args:
        pdf_path: Path of the source PDF.
        start_page: 1-based number of the first page to keep; any value
            accepted by ``int()``.
        output_path: Path the resulting PDF is written to.

    Raises:
        ValueError: If ``start_page`` cannot be converted to an integer, is
            smaller than 1, or lies beyond the last page of the source PDF.
    """
    # Make sure start_page is an integer before doing any arithmetic on it.
    start_page = int(start_page)
    if start_page < 1:
        # A value below 1 would become a negative index, which wraps around
        # to the end of the document instead of failing.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        total_pages = len(reader.pages)
        if start_page > total_pages:
            # Fail before creating the output rather than writing a PDF
            # that contains no pages.
            raise ValueError(
                f"start_page {start_page} is beyond the last page ({total_pages})"
            )

        writer = PyPDF2.PdfWriter()
        # Page numbers are 1-based while page indexes start at 0.
        for index in range(start_page - 1, total_pages):
            writer.add_page(reader.pages[index])

        # NOTE(review): the write is kept inside the source file's `with`
        # block in case page data is read lazily from it — confirm before
        # moving it out.
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
|
||
|
||
# Usage example
pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
output_path = "D:\\项目\\工程招标\\tb_format.pdf"
keyword = "投标文件格式"  # change this to the keyword you want to look up

# Run the example only when executed as a script, so that importing this
# module does not read or write any PDF as a side effect.
if __name__ == "__main__":
    page_number = extract_contents_with_pages(pdf_path, keyword)
    print(page_number)
    if page_number is not None:
        # Keep everything from the located page to the end of the document.
        split_pdf(pdf_path, page_number, output_path)
    else:
        print("未找到含有关键字的页码")
|