zbparse/flask_app/main/按页读取pdf.py

32 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import PyPDF2
import re # 导入正则表达式库
def clean_page_numbers(text):
    """Strip page-number artifacts from the text extracted from a PDF page.

    Three substitutions are applied, in this order:

    1. a run of digits at the start of the text (optionally preceded by
       whitespace) together with the whitespace that follows it;
    2. a run of digits at the end of the text together with the whitespace
       that precedes it (and any trailing whitespace);
    3. every ``/<digits>`` occurrence such as ``/129`` anywhere in the text,
       together with the whitespace surrounding it.

    Returns the text with all of those matches removed.
    """
    # Order matters: the leading/trailing numbers are removed before the
    # "/<digits>" form is stripped from the whole text.
    page_number_patterns = (
        r'^\s*\d+\s+',
        r'\s+\d+\s*$',
        r'\s*\/\s*\d+\s*',
    )
    stripped = text
    for pattern in page_number_patterns:
        stripped = re.sub(pattern, '', stripped)
    return stripped
def extract_text_by_page(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page is read with PyPDF2, passed through ``clean_page_numbers`` and
    added to the result in page order, with no separator between pages.
    Pages for which no text is returned are skipped, and a notice carrying
    the 1-based page number is printed for each of them.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated, cleaned text of all pages that yielded text
        (an empty string if none did).
    """
    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly; numbering starts at 1 for the message.
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"Page {page_number} is empty or text could not be extracted.")
                continue
            page_texts.append(clean_page_numbers(text))
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)
if __name__ == '__main__':
    # Manual run against a local sample PDF; the returned text is not used,
    # so the only output is the notice printed for pages without text.
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
    extract_text_by_page(sample_pdf)