31 lines
1.3 KiB
Python
31 lines
1.3 KiB
Python
|
import PyPDF2
|
|||
|
import re # 导入正则表达式库
|
|||
|
|
|||
|
def clean_page_numbers(text):
    """Strip page-number-like digit runs from a page's extracted text.

    Three substitutions are applied in order, each one operating on the
    result of the previous one:

    1. Digits at the very start of the text (optionally preceded by
       whitespace) that are followed by whitespace.
    2. Whitespace followed by digits at the very end of the text
       (optionally followed by more whitespace).
    3. Every occurrence of a slash followed by digits, such as ``/129``,
       together with any whitespace directly around it.

    Args:
        text: The text to clean.

    Returns:
        The text with every match of the patterns above removed.
    """
    # Order matters: each pattern runs on the output of the previous one.
    page_number_patterns = (
        r'^\s*\d+\s+',      # page number at the start of the text
        r'\s+\d+\s*$',      # page number at the end of the text
        r'\s*\/\s*\d+\s*',  # page number written like "/129"
    )
    for pattern in page_number_patterns:
        text = re.sub(pattern, '', text)
    return text
|
|||
|
|
|||
|
def extract_text_by_page(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page's extracted text is passed through ``clean_page_numbers``
    before being collected. A page for which ``extract_text()`` returns a
    falsy value is skipped, and a message naming its 1-based page number is
    printed to stdout.

    Args:
        file_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        The cleaned text of all pages joined in page order, with no
        separator inserted between pages. An empty string if no page
        yielded any text.
    """
    page_texts = []
    # All page access happens inside the ``with`` block, while the
    # underlying file is still open.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Number pages from 1 so the message matches human page numbering.
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                page_texts.append(clean_page_numbers(text))
            else:
                print(f"Page {page_num} is empty or text could not be extracted.")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)
|
|||
|
if __name__ == "__main__":
    # Ad-hoc manual run against a local sample PDF; the returned text is
    # not used here.
    pdf_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile_tobidders_notice_table.pdf"
    extract_text_by_page(pdf_path)
|