2024-11-01 14:28:10 +08:00
|
|
|
|
import re
|
2024-12-05 15:01:37 +08:00
|
|
|
|
from PyPDF2 import PdfReader
|
2024-11-01 14:28:10 +08:00
|
|
|
|
def extract_common_header(pdf_path):
    """Detect the header lines that repeat at the top of a PDF's pages.

    Up to three consecutive pages are sampled, first around the middle of the
    document and then from its beginning. For each sample, the first three
    lines of every page that yields text are compared position by position;
    lines that are identical on all sampled pages are taken as the header.
    The first sample that produces at least one common line wins.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PdfReader``.

    Returns:
        The common header lines joined with ``'\\n'``, or an empty string
        when the document has fewer than two pages or no common line exists.
    """

    def get_headers(pdf_document, start_page, pages_to_read):
        """Collect the first three stripped lines of each sampled page.

        Pages whose extracted text is empty are skipped, so the result may
        hold fewer entries than ``pages_to_read``.
        """
        last_page = min(start_page + pages_to_read, len(pdf_document.pages))
        headers = []
        for i in range(start_page, last_page):
            text = pdf_document.pages[i].extract_text() or ""
            if text:
                # Keep only the first three lines of the page, stripped.
                headers.append(
                    [line.strip() for line in text.strip().split('\n')[:3]]
                )
        return headers

    def find_common_headers(headers):
        """Return the lines that are identical at the same position on every page."""
        if not headers:
            return []
        # zip aligns the n-th line of every page (and stops at the shortest page).
        return [
            lines[0]
            for lines in zip(*headers)
            if all(line == lines[0] for line in lines[1:])
        ]

    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)

    # A header can only be confirmed by comparing at least two pages, so
    # there is no point extracting text from shorter documents.
    if total_pages < 2:
        return ''

    window = min(3, total_pages)
    middle_start = max(0, total_pages // 2 - 1) if total_pages >= 3 else 0

    # Strategy 1: the pages around the middle; strategy 2: the leading pages.
    # For short documents both windows coincide; dict.fromkeys drops the
    # duplicate (keeping order) so the same pages are not extracted twice.
    strategies = list(dict.fromkeys([(middle_start, window), (0, window)]))

    for start, count in strategies:
        headers = get_headers(pdf_document, start, count)
        if len(headers) < 2:
            continue  # need at least two pages with text to compare
        current_common = find_common_headers(headers)
        if current_common:
            # Stop at the first strategy that finds a common part.
            return '\n'.join(current_common)

    return ''
|
|
|
|
|
|
|
|
|
|
def clean_page_content(text, common_header):
    """Strip the repeated header and leading page-number marks from page text.

    All patterns are anchored at the very start of ``text``; nothing in the
    middle of the page is touched.

    Args:
        text: Extracted text of a single page.
        common_header: Newline-separated header lines to remove; a falsy
            value means there is no header to strip.

    Returns:
        ``text`` with the leading header lines and page-number marks removed.
    """
    # First remove the common header, one line at a time.
    if common_header:
        for header_line in common_header.split('\n'):
            header_line = header_line.strip()
            if not header_line:  # skip blank header lines
                continue
            # The header lines are stored stripped, while the page text is
            # raw: tolerate whitespace before the line and trailing blanks
            # (but not a second newline) after it, otherwise a single stray
            # space prevents this and every following header line from
            # matching.
            pattern = r'^\s*' + re.escape(header_line) + r'[^\S\n]*\n?'
            text = re.sub(pattern, '', text, count=1)

    # Remove a leading page mark of the form "第x页".
    text = re.sub(r'^\s*第\d+页\s*', '', text)

    # Remove page numbers such as 89/129; the next steps together delete it
    # completely.
    # Leading number, only when followed by a non-digit character. Known
    # limitation: when a page number and a section number run together
    # (e.g. '2018.' = page 20 + section '18.'), the section number is
    # removed as well.
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
    # Remainders such as '/123', '/共123', '/共123页', '/123页'.
    text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text)
    # Page numbers of the form '—2—' or '-2-'.
    text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text)

    return text
|
|
|
|
|
|
2024-12-06 09:25:24 +08:00
|
|
|
|
|
|
|
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_scanned_pdf(file_path, max_pages=15):
    """Check whether a PDF looks like a scan (no extractable text).

    Only the first ``max_pages`` pages are inspected.

    Args:
        file_path: Path of the PDF file; opened in binary mode.
        max_pages: Maximum number of leading pages to check (default 15).

    Returns:
        True if none of the inspected pages yields any non-whitespace text
        (treated as a scanned document); False as soon as one page has text.
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for index, page in enumerate(reader.pages):
            if index >= max_pages:  # checked enough pages, stop
                break
            # Guard against a missing extraction result, as the header
            # extraction code in this file does.
            if (page.extract_text() or "").strip():
                return False  # text found: not a scanned document
    return True  # no text in the first max_pages pages
|
|
|
|
|
|
2024-12-05 15:01:37 +08:00
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual check against a local sample file: report whether the PDF is
    # a scanned document or a regular (text-based) one.
    file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
    print("扫描型" if is_scanned_pdf(file_path) else "普通型")
|