32 lines
1.3 KiB
Python
32 lines
1.3 KiB
Python
import PyPDF2
|
||
import re # 导入正则表达式库
|
||
|
||
def clean_page_numbers(text):
    """Strip page-number artifacts from the extracted text of one page.

    Three substitutions are applied in order, each on the result of the
    previous one:

    1. A run of digits at the very start of the text (optionally preceded
       by whitespace) together with the whitespace that follows it.
    2. A run of digits at the very end of the text together with the
       whitespace that precedes it (and any trailing whitespace).
    3. Every slash followed by digits, such as ``/129``, wherever it
       occurs, together with the whitespace around it.

    Note that step 3 is not restricted to headers or footers: it also
    removes slash-number sequences inside ordinary text (for example
    ``2024/09/03`` becomes ``2024``).

    Args:
        text: Raw text of a single page.

    Returns:
        The text with the matched page-number fragments removed.
    """
    page_number_patterns = (
        r'^\s*\d+\s+',       # leading page number
        r'\s+\d+\s*$',       # trailing page number
        r'\s*\/\s*\d+\s*',   # "/129"-style page number anywhere in the text
    )
    for pattern in page_number_patterns:
        text = re.sub(pattern, '', text)
    return text
|
||
|
||
def extract_text_by_page(file_path):
    """Extract the text of every page of a PDF and concatenate it.

    Each page's text is passed through ``clean_page_numbers`` before being
    collected. Pages for which ``extract_text()`` returns an empty value
    are skipped, and a message naming the 1-based page number is printed
    to standard output.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The cleaned text of all pages that yielded text, joined in page
        order with no separator inserted between pages.
    """
    # Collect per-page pieces and join once at the end instead of growing
    # a string with repeated "+=".
    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                page_texts.append(clean_page_numbers(text))
            else:
                print(f"Page {page_number} is empty or text could not be extracted.")
    return "".join(page_texts)
|
||
if __name__ == "__main__":
    # Ad-hoc manual run against a hard-coded local PDF; the returned text
    # is not used here.
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
    extract_text_by_page(sample_pdf)
|