43 lines
1.7 KiB
Python
43 lines
1.7 KiB
Python
|
import PyPDF2
|
|||
|
import re
|
|||
|
|
|||
|
def extract_contents_with_pages(pdf_path, keyword):
    """Return the number printed at the end of the first line mentioning ``keyword``.

    The text of every page is scanned line by line, in page order. The first
    line that contains ``keyword`` (compared case-insensitively) and ends
    with a run of digits decides the result, e.g. a table-of-contents entry
    such as ``<title> ...... 45``.

    The value is the number as it appears in the extracted text; it is not
    checked against the actual page count of the PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        keyword: Text to look for. An empty keyword would match every line,
            so it is treated as "not found" and the file is not opened.

    Returns:
        The trailing number of the first matching line as an ``int``, or
        ``None`` when no line contains the keyword followed by a trailing
        number.
    """
    if not keyword:
        return None

    # Loop invariants: normalise the keyword and build the pattern only once.
    needle = keyword.lower()
    trailing_number = re.compile(r"\d+(?=\s*$)")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                # Nothing could be extracted from this page; skip it.
                continue
            for line in text.split("\n"):
                if needle not in line.lower():
                    continue
                match = trailing_number.search(line)
                if match:
                    # Return the page number directly as an integer.
                    return int(match.group(0))

    # Every page was scanned without finding a page number.
    return None
|
|||
|
|
|||
|
def split_pdf(pdf_path, start_page, output_path):
    """Write the pages from ``start_page`` through the last page to a new PDF.

    Args:
        pdf_path: Path of the source PDF file.
        start_page: 1-based number of the first page to keep. Any value
            accepted by ``int()`` is allowed (e.g. ``5`` or ``"5"``).
        output_path: Path of the PDF file to create; an existing file is
            overwritten.

    Raises:
        ValueError: If ``start_page`` cannot be converted to an integer, is
            smaller than 1, or is larger than the number of pages in the
            source PDF. No output file is created in these cases.
    """
    # Make sure start_page is an integer before doing any arithmetic on it.
    start_page = int(start_page)
    if start_page < 1:
        # A value below 1 would become a negative index and pull pages from
        # the end of the document instead of failing.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        page_count = len(reader.pages)
        if start_page > page_count:
            # Fail loudly rather than silently writing an empty PDF.
            raise ValueError(
                f"start_page {start_page} is beyond the last page ({page_count})"
            )

        writer = PyPDF2.PdfWriter()
        # Page indices are 0-based, so the 1-based start_page is shifted by one.
        for index in range(start_page - 1, page_count):
            writer.add_page(reader.pages[index])

        # Written while the source file is still open, since the writer may
        # still need to read page data from it.
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
|
|||
|
|
|||
|
# Usage example
pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
output_path = "D:\\项目\\工程招标\\tb_format.pdf"
keyword = "投标文件格式"  # change this to the keyword you want to look up

if __name__ == "__main__":
    # Run the example only when executed as a script, so that importing this
    # module does not read or write any PDF files.
    page_number = extract_contents_with_pages(pdf_path, keyword)
    print(page_number)
    if page_number is not None:
        split_pdf(pdf_path, page_number, output_path)
    else:
        print("未找到含有关键字的页码")
|