zbparse/flask_app/main/find_tbformat.py
2024-08-29 16:37:09 +08:00

43 lines
1.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import PyPDF2
import re
def extract_contents_with_pages(pdf_path, keyword):
    """Return the number printed at the end of the first line mentioning ``keyword``.

    The text of every page is extracted and scanned line by line.  A line
    qualifies when it contains ``keyword`` (compared case-insensitively) and
    ends with a run of digits, optionally followed by whitespace.  Lines that
    contain the keyword but carry no trailing number are skipped and the scan
    continues with the next line / page.

    NOTE(review): the result is the number as printed in the text (presumably
    a table-of-contents entry).  Whether it equals the physical page position
    inside the PDF is not checked here -- confirm against the documents used.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to search for.  An empty (or ``None``) keyword is
            treated as "not found" without opening the file, because an empty
            string is contained in every line and would otherwise match the
            first line that happens to end with digits.

    Returns:
        The trailing number of the first qualifying line as an ``int``, or
        ``None`` when no line qualifies.

    Raises:
        OSError: If ``pdf_path`` cannot be opened (e.g. ``FileNotFoundError``).
    """
    if not keyword:
        return None

    # Loop invariants: lower-case the keyword and compile the pattern once
    # instead of once per scanned line.
    needle = keyword.lower()
    trailing_number = re.compile(r"\d+(?=\s*$)")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            for line in text.split('\n'):
                if needle not in line.lower():
                    continue
                match = trailing_number.search(line)
                if match:
                    # Hand the page number back as an integer right away.
                    return int(match.group(0))

    # Every page was scanned without finding a keyword line ending in a number.
    return None
def split_pdf(pdf_path, start_page, output_path):
    """Copy the pages from ``start_page`` through the last page into a new PDF.

    Args:
        pdf_path: Path of the source PDF.
        start_page: 1-based number of the first page to keep.  Any value
            accepted by ``int()`` is allowed (e.g. ``3`` or ``"3"``).
        output_path: Path of the PDF to write; opened in ``"wb"`` mode, so an
            existing file is overwritten.

    Raises:
        ValueError: If ``start_page`` cannot be converted to ``int``, is
            smaller than 1, or is larger than the number of pages in the
            source document.
        OSError: If the source cannot be opened or the output cannot be
            written.
    """
    # Make sure start_page is an integer.
    start_page = int(start_page)
    if start_page < 1:
        # start_page - 1 would be negative; as a sequence index that does not
        # mean "first page", so the wrong pages would be copied (or an
        # IndexError raised).  Reject it before touching any file.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        total_pages = len(reader.pages)
        if start_page > total_pages:
            # Previously this silently produced a PDF without any pages.
            raise ValueError(
                f"start_page {start_page} exceeds the document's "
                f"{total_pages} page(s)"
            )

        writer = PyPDF2.PdfWriter()
        # Page numbers are 1-based for callers while the page list is
        # 0-based, hence the -1 shift.
        for index in range(start_page - 1, total_pages):
            writer.add_page(reader.pages[index])

        # The output is written while the source file is still open.
        # NOTE(review): presumably page data may be read lazily from the
        # source stream -- confirm before moving this out of the `with`.
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
# Usage example.
# NOTE(review): these statements are at module level, so they run whenever
# this file is executed or imported -- consider an
# `if __name__ == "__main__":` guard if the functions above are imported
# from elsewhere.
keyword = "投标文件格式"  # change this to the keyword you want to look up
pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
output_path = "D:\\项目\\工程招标\\tb_format.pdf"

page_number = extract_contents_with_pages(pdf_path, keyword)
print(page_number)
if page_number is None:
    # No line containing the keyword ended with a page number.
    print("未找到含有关键字的页码")
else:
    # Keep everything from the located page through the end of the document.
    split_pdf(pdf_path, page_number, output_path)