zbparse/flask_app/main/按页读取pdf.py

31 lines
1.3 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
import PyPDF2
import re # 导入正则表达式库
def clean_page_numbers(text):
    """Strip page-number artifacts from the extracted text of one PDF page.

    Three heuristics are applied in order:

    1. A run of digits at the very start of the text (optionally preceded
       by whitespace) is removed when it is followed by whitespace.
    2. A run of digits at the very end of the text is removed when it is
       preceded by whitespace.
    3. Every "/<digits>" occurrence (e.g. "/129"), together with the
       whitespace around it, is removed wherever it appears.

    The patterns are not compiled with ``re.MULTILINE``, so rules 1 and 2
    look only at the start and end of the whole string, not at each line.

    Args:
        text: Text of a single page. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The text with the matched page-number fragments removed.
    """
    if not text:
        # Nothing to clean; also avoids a TypeError from re.sub on None.
        return ""

    # Rule 1: page number at the beginning of the page text.
    cleaned_text = re.sub(r'^\s*\d+\s+', '', text)
    # Rule 2: page number at the end of the page text.
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # Rule 3: "/129"-style total-page markers anywhere in the text.
    # NOTE(review): this also strips things like the "/08" in "2024/08/29"
    # and joins the words on either side of the match -- confirm that is
    # acceptable for the documents being parsed.
    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)
    return cleaned_text
def extract_text_by_page(file_path):
    """Extract the text of a PDF page by page and return it as one string.

    Each page's text is passed through ``clean_page_numbers`` before being
    appended. Pages are concatenated directly, with no separator inserted
    between them. Pages for which no text is returned are skipped and a
    notice with the 1-based page number is printed.

    Args:
        file_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The cleaned text of all pages that yielded text, concatenated in
        page order. An empty string if no page yielded text.
    """
    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # start=1 so the printed page number is the human-readable one.
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                page_texts.append(clean_page_numbers(text))
            else:
                print(f"Page {page_num} is empty or text could not be extracted.")
    # Single join instead of repeated string concatenation in the loop.
    return "".join(page_texts)
if __name__ == '__main__':
    # Imported here because it is only needed when run as a script.
    import sys

    # Sample document used when no path is given on the command line.
    default_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile_tobidders_notice_table.pdf"
    # An optional first command-line argument overrides the sample path.
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    # The returned text is discarded; only the notices for pages without
    # extractable text are printed.
    extract_text_by_page(file_path)