2024-09-04 17:49:05 +08:00
|
|
|
import re
|
2024-08-29 16:37:09 +08:00
|
|
|
|
2024-09-06 15:50:52 +08:00
|
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
|
|
def _common_words(headers):
    """Return the words present in every header, in first-header order."""
    first, *rest = headers
    other_word_sets = [set(header.split()) for header in rest]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic; iterating a plain set of strings is not.
    unique_first_words = dict.fromkeys(first.split())
    return [
        word
        for word in unique_first_words
        if all(word in words for words in other_word_sets)
    ]


def extract_common_header(pdf_path, *, num_pages_to_read=3):
    """Find the words shared by the first line of the leading PDF pages.

    The first non-blank line of each of the first ``num_pages_to_read``
    pages is collected, and the whitespace-separated words that occur in
    every collected line are returned, joined by single spaces, in the
    order they appear in the first collected line.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.
        num_pages_to_read: How many leading pages to inspect (default 3).

    Returns:
        The common words joined by spaces, or ``""`` when fewer than two
        inspected pages yield any text or when no word is shared.
    """
    pdf_document = PdfReader(pdf_path)

    headers = []
    pages_to_scan = min(num_pages_to_read, len(pdf_document.pages))
    for i in range(pages_to_scan):
        text = pdf_document.pages[i].extract_text()
        # Treat missing and whitespace-only text alike: such a page has no
        # first line and must not wipe out the intersection below.
        stripped = text.strip() if text else ""
        if not stripped:
            continue
        headers.append(stripped.split('\n', 1)[0])

    if len(headers) < 2:
        return ""  # Not enough pages with text to compare.

    return ' '.join(_common_words(headers))
|
|
|
|
|
|
|
|
# Ad-hoc manual run: extract the common header from a local sample PDF and
# print it to stdout.
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
res = extract_common_header(input_path)
print(res)
|