"""Find the words shared by the first lines of a PDF's leading pages."""

import re
from PyPDF2 import PdfReader
def extract_common_header(pdf_path):
    """Return the words shared by the first lines of a PDF's leading pages.

    The first line of text on each of the first few pages is taken as a
    header candidate. Whitespace-separated words that occur in every
    candidate are returned, joined by single spaces, in the order in which
    they first appear in the first candidate.

    Args:
        pdf_path: The PDF to open; passed straight to ``PdfReader``.

    Returns:
        The shared header words as a single string. An empty string is
        returned when fewer than two of the sampled pages contain text
        (nothing to compare) or when the candidates share no word.
    """
    pdf_document = PdfReader(pdf_path)
    num_pages_to_read = 3  # number of leading pages to sample

    headers = []
    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        text = pdf_document.pages[i].extract_text()
        # Strip before testing so that pages holding only whitespace are
        # skipped instead of contributing an empty header, which would
        # erase every common word.
        stripped = text.strip() if text else ""
        if not stripped:
            continue
        headers.append(stripped.partition("\n")[0])

    if len(headers) < 2:
        return ""  # not enough pages with text to compare

    first_words = headers[0].split()
    shared = set(first_words).intersection(
        *(set(header.split()) for header in headers[1:])
    )
    # A set has no defined order, so joining it directly would scramble the
    # header. Walk the first header instead (dict.fromkeys drops repeated
    # words while keeping first-seen order) and keep only the shared words.
    ordered = [word for word in dict.fromkeys(first_words) if word in shared]
    return " ".join(ordered)
if __name__ == "__main__":
    # Manual run against a local sample file. Guarded so that importing this
    # module does not open a machine-specific path or print anything.
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
    res = extract_common_header(input_path)
    print(res)