zbparse/flask_app/general/clean_pdf.py
2024-12-05 15:01:37 +08:00

97 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from PyPDF2 import PdfReader
def extract_common_header(pdf_path):
    """Detect header lines that repeat across several pages of a PDF.

    The first three lines of each sampled page are compared position by
    position; a line is kept only if it is identical on every sampled page.
    Two page windows are tried in order: three pages around the middle of
    the document, then the first three pages. The first window producing a
    non-empty result wins.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The common header lines joined by newlines, or an empty string when
        no common line is found or fewer than two sampled pages have text.
    """

    def get_headers(pdf_document, start_page, pages_to_read):
        """Collect the stripped first three lines of every page with text in the window."""
        stop_page = min(start_page + pages_to_read, len(pdf_document.pages))
        headers = []
        for page_index in range(start_page, stop_page):
            text = pdf_document.pages[page_index].extract_text() or ""
            if text:
                # Only the first three lines of a page are header candidates.
                headers.append(
                    [line.strip() for line in text.strip().split('\n')[:3]]
                )
        return headers

    def find_common_headers(headers):
        """Return the lines that are identical at the same position on every page."""
        if not headers:
            return []
        # zip aligns line 1 of every page, line 2 of every page, and so on.
        return [
            lines[0]
            for lines in zip(*headers)
            if all(line == lines[0] for line in lines[1:])
        ]

    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)

    # A comparison needs at least two pages, so shorter documents can never
    # yield a common header; skip the text extraction entirely.
    if total_pages < 2:
        return ''

    if total_pages >= 3:
        # Strategy 1: three pages centred on the middle of the document.
        # Strategy 2: the first three pages.
        middle_start = max(0, total_pages // 2 - 1)
        candidates = [(middle_start, 3), (0, 3)]
    else:
        # Only two pages exist: both strategies collapse to the same window.
        candidates = [(0, total_pages)]

    # Drop duplicate windows (e.g. a 3-page document) while keeping order, so
    # the same pages are not extracted and compared twice.
    strategies = list(dict.fromkeys(candidates))

    for start, count in strategies:
        headers = get_headers(pdf_document, start, count)
        if len(headers) < 2:
            continue  # need at least two pages with text to compare
        common_headers = find_common_headers(headers)
        if common_headers:
            return '\n'.join(common_headers)
    return ''
def clean_page_content(text, common_header):
    """Strip the shared header lines and page-number artifacts from a page's text.

    Args:
        text: Extracted text of a single page.
        common_header: Newline-joined header lines to remove from the start
            of ``text``; a falsy value means there is no header to remove.

    Returns:
        The cleaned page text.
    """
    # First remove the common header, one line at a time, from the start of the text.
    if common_header:
        for header_line in common_header.split('\n'):
            header_line = header_line.strip()
            if not header_line:
                continue  # ignore blank header lines
            # The header lines are stripped, but the raw page text may still
            # carry whitespace around them. Accept leading whitespace
            # (including whatever a previous removal left behind) and
            # trailing spaces/tabs; otherwise a single trailing space after
            # the first header line prevents every later header line from
            # matching at the start of the text.
            pattern = r'^\s*' + re.escape(header_line) + r'[^\S\n]*\n?'
            text = re.sub(pattern, '', text, count=1)

    # Remove page numbers. For something like "89/129" the leading number is
    # removed first and the "/129" remainder by the third substitution.
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # leading page number, only when followed by a non-digit
    text = re.sub(r'\s+\d+\s*$', '', text)  # trailing page number
    text = re.sub(r'\s*\/\s*\d+\s*', '', text)  # "/129"-style total-page suffix
    text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text)  # page numbers written like '—2—' or '-2-'
    return text
def is_scanned_pdf(file_path):
    """Report whether a PDF has no extractable text on any page.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        ``True`` when every page yields no text after stripping whitespace
        (treated as a scanned document), ``False`` as soon as one page
        contains text.
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        # Guard the extraction result with `or ""`, the same way
        # extract_common_header does, so a page that yields nothing cannot
        # break the .strip() call. any() stops at the first page with text.
        has_text = any(
            (page.extract_text() or "").strip() for page in reader.pages
        )
    return not has_text
if __name__ == '__main__':
    # Manual check: classify one local sample file and print the verdict.
    file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
    verdict = "扫描型" if is_scanned_pdf(file_path) else "普通型"
    print(verdict)