zbparse/flask_app/main/find_tbformat.py

43 lines
1.7 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
import PyPDF2
import re
def extract_contents_with_pages(pdf_path, keyword):
    """Return the number printed at the end of the first line mentioning *keyword*.

    The pages of the PDF are scanned in order and the text extracted from
    each page is examined line by line. The first line that contains
    ``keyword`` (compared case-insensitively) and ends in a run of digits,
    optionally followed by whitespace, determines the result.

    The number is taken verbatim from the document text; it is not checked
    against the actual page count of the PDF.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to look for (presumably a table-of-contents heading —
            confirm against callers).

    Returns:
        The trailing number of the first matching line as an ``int``, or
        ``None`` when ``keyword`` is empty or no matching line ends in
        digits.
    """
    # An empty keyword is a substring of every line, so it would "match" the
    # first line that merely happens to end in digits. Report "not found"
    # instead, without touching the file at all.
    if not keyword:
        return None

    # Loop invariants: lower-case the keyword and compile the pattern once.
    needle = keyword.lower()
    trailing_number = re.compile(r"\d+(?=\s*$)")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                # No extractable text on this page.
                continue
            for line in text.split("\n"):
                if needle not in line.lower():
                    continue
                match = trailing_number.search(line)
                if match:
                    # Return the page number as an int right away.
                    return int(match.group(0))

    # Every page was scanned without finding a usable line.
    return None
def split_pdf(pdf_path, start_page, output_path):
    """Copy the pages from ``start_page`` through the last page into a new PDF.

    Args:
        pdf_path: Path of the source PDF.
        start_page: 1-based number of the first page to keep. Any value
            accepted by ``int()`` is allowed. If it lies past the last page,
            no pages are added to the output.
        output_path: Path the resulting PDF is written to.

    Raises:
        ValueError: If ``start_page`` cannot be converted to an integer or
            is smaller than 1.
    """
    # Make sure start_page is an integer, and validate it before any file is
    # opened: a value below 1 would produce a negative index, which silently
    # wraps around to pages counted from the end of the document.
    start_page = int(start_page)
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()

        # Page indices are 0-based, so the 1-based start_page is shifted by one.
        for index in range(start_page - 1, len(reader.pages)):
            writer.add_page(reader.pages[index])

        # The output is written while the source file is still open
        # (assumption: the writer may still need to read page data from the
        # source stream at this point — confirm against PyPDF2 behaviour).
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
# Usage example.
# NOTE(review): these statements are executed whenever this module is run or
# imported, and they rely on hard-coded local Windows paths — consider moving
# them under an `if __name__ == "__main__":` guard.
pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
output_path = "D:\\项目\\工程招标\\tb_format.pdf"
keyword = "投标文件格式"  # change this to the keyword you want to look up
page_number = extract_contents_with_pages(pdf_path, keyword)
print(page_number)
if page_number is None:
    # Nothing usable was found, so there is nothing to split.
    print("未找到含有关键字的页码")
else:
    # Keep everything from the located page to the end of the document.
    split_pdf(pdf_path, page_number, output_path)