import re

from PyPDF2 import PdfReader


def extract_common_header(pdf_path, num_pages_to_read=3):
    """Return the words shared by the first text line of a PDF's leading pages.

    The first line of extracted text is taken from each of the first
    ``num_pages_to_read`` pages; pages whose extracted text is empty or
    whitespace-only are skipped.  The result contains every
    whitespace-separated word that occurs in all of those first lines,
    joined by single spaces, in the order the words appear on the first
    sampled page.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the
            PDF to inspect.
        num_pages_to_read: Maximum number of leading pages to sample.
            Defaults to 3.

    Returns:
        The common words as a single space-joined string, or ``""`` when
        fewer than two sampled pages yield a first line to compare.
    """
    reader = PdfReader(pdf_path)
    pages_to_scan = min(num_pages_to_read, len(reader.pages))

    headers = []
    for page_index in range(pages_to_scan):
        text = reader.pages[page_index].extract_text()
        stripped = text.strip() if text else ""
        if not stripped:
            # A page without real text would contribute an empty header and
            # wipe out the intersection, so leave it out of the comparison.
            continue
        headers.append(stripped.split('\n', 1)[0])

    if len(headers) < 2:
        # Not enough pages to compare against each other.
        return ""

    first_header, *other_headers = headers
    other_word_sets = [set(header.split()) for header in other_headers]

    # Walk the first header's words in their original order (de-duplicated)
    # rather than iterating a set: set ordering of strings varies between
    # interpreter runs, which made the joined result nondeterministic.
    common_words = [
        word
        for word in dict.fromkeys(first_header.split())
        if all(word in words for words in other_word_sets)
    ]
    return ' '.join(common_words)


if __name__ == "__main__":
    # Ad-hoc manual check against a local sample file; guarded so that
    # importing this module does not try to open the hard-coded path.
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
    res = extract_common_header(input_path)
    print(res)