"""Extract the text of a PDF page by page, stripping page-number artifacts."""

import re

try:
    import PyPDF2
except ImportError:
    # Keep the pure-text helper importable when the PDF backend is missing;
    # extract_text_by_page reports the missing dependency when it is called.
    PyPDF2 = None

# Page-number patterns, compiled once instead of on every page.
_LEADING_PAGE_NUMBER = re.compile(r'^\s*\d+\s+')
_TRAILING_PAGE_NUMBER = re.compile(r'\s+\d+\s*$')
_SLASH_PAGE_NUMBER = re.compile(r'\s*\/\s*\d+\s*')


def clean_page_numbers(text: str) -> str:
    """Remove page-number artifacts from the text of a single page.

    Three patterns are removed, in this order:

    1. a number at the very start of the text, followed by whitespace;
    2. a number at the very end of the text, preceded by whitespace;
    3. every "/<number>" sequence (for example "/129"), together with the
       whitespace around it.

    The third pattern is not anchored: it matches anywhere in the text, so
    any other slash-number sequence (such as the tail of a date written
    with slashes) is removed as well.

    Args:
        text: Text extracted from one page.

    Returns:
        The text with the matched page-number fragments removed.
    """
    cleaned_text = _LEADING_PAGE_NUMBER.sub('', text)
    cleaned_text = _TRAILING_PAGE_NUMBER.sub('', cleaned_text)
    cleaned_text = _SLASH_PAGE_NUMBER.sub('', cleaned_text)
    return cleaned_text


def extract_text_by_page(file_path: str) -> str:
    """Return the cleaned text of every page of a PDF as a single string.

    Each page's text is passed through clean_page_numbers and the results
    are concatenated in page order with no separator between pages. Pages
    that yield no text are skipped and reported on standard output.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated, cleaned text of all pages that produced text.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    if PyPDF2 is None:
        raise ImportError("PyPDF2 is required to extract text from PDF files.")

    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                page_texts.append(clean_page_numbers(text))
            else:
                print(f"Page {page_num} is empty or text could not be extracted.")
    # Join once at the end rather than growing a string inside the loop.
    return "".join(page_texts)


if __name__ == '__main__':
    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile_tobidders_notice_table.pdf"
    extract_text_by_page(file_path)