2024-11-01 14:28:10 +08:00
|
|
|
|
import re
|
2025-02-13 15:42:52 +08:00
|
|
|
|
import threading
|
2025-01-21 14:21:49 +08:00
|
|
|
|
import fitz
|
2024-12-05 15:01:37 +08:00
|
|
|
|
from PyPDF2 import PdfReader
|
2025-02-13 15:42:52 +08:00
|
|
|
|
from docx import Document
|
2025-02-14 15:46:07 +08:00
|
|
|
|
import queue
|
|
|
|
|
from queue import Queue, Empty
|
2025-02-16 18:09:45 +08:00
|
|
|
|
|
|
|
|
|
# Helper functions that extract the text of a single page
|
|
|
|
|
def get_text_pypdf2(pdf_doc, page_num):
    """Return the text of one page via PyPDF2; never returns None."""
    page = pdf_doc.pages[page_num]
    extracted = page.extract_text()
    return extracted if extracted else ""
|
|
|
|
|
|
|
|
|
|
def get_text_fitz(pdf_doc, page_num):
    """Return the text of one page via PyMuPDF (fitz); never returns None."""
    content = pdf_doc.load_page(page_num).get_text()
    return "" if not content else content
|
|
|
|
|
# Factory that binds the pdf_document object into a page-number -> text callable
|
|
|
|
|
def create_get_text_function(pdf_lib, pdf_doc):
    """
    Build a ``page_num -> text`` callable bound to ``pdf_doc``.

    ``pdf_lib`` selects the backend: ``'pypdf2'`` or ``'fitz'``; any other
    value raises ``ValueError``.
    """
    if pdf_lib == 'fitz':
        return lambda page_num: get_text_fitz(pdf_doc, page_num)
    if pdf_lib == 'pypdf2':
        return lambda page_num: get_text_pypdf2(pdf_doc, page_num)
    raise ValueError("不支持的 PDF 库。")
|
|
|
|
|
|
|
|
|
|
|
2025-01-03 17:36:23 +08:00
|
|
|
|
def extract_common_header(pdf_path):
    """
    Extract the common page header of a PDF file.

    Pages are sampled with two strategies (three pages around the middle,
    then the first three pages as a fallback); the first three lines of each
    sampled page are compared, and the word-prefix shared by every page at
    each line position is kept as the header.

    Args:
        pdf_path: path of the PDF file.

    Returns:
        The common header as a '\\n'-joined string; "" when no common header
        is found or the file cannot be read.
    """

    def get_headers(get_text, start_page, pages_to_read, total_pages):
        # Collect up to the first three stripped lines of each sampled page.
        headers = []
        for i in range(start_page, min(start_page + pages_to_read, total_pages)):
            try:
                text = get_text(i)
                if text:
                    first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
                    headers.append(first_lines)
            except Exception as e:
                print(f"提取第 {i} 页文本时出错: {e}")
                continue
        return headers

    def find_common_headers(headers):
        # For each line position, keep the longest whitespace-token prefix
        # shared by all sampled pages; very short lines (< 5 chars) are
        # treated as noise and dropped.
        if not headers:
            return []

        common_headers = []
        for lines in zip(*headers):  # transpose: one tuple per line position
            split_lines = [line.split() for line in lines]
            min_parts = min(len(parts) for parts in split_lines)
            if min_parts == 0:
                continue

            common_parts = []
            for part_idx in range(min_parts):
                current_parts = [parts[part_idx] for parts in split_lines]
                if all(part == current_parts[0] for part in current_parts[1:]):
                    common_parts.append(current_parts[0])
                else:
                    break  # stop at the first mismatching token

            if common_parts:
                common_header_line = ' '.join(common_parts)
                if len(common_header_line) >= 5:  # minimum plausible header length
                    common_headers.append(common_header_line)
        return common_headers

    def extract_common_header_main(get_text, total_pages):
        # Strategy 1: three pages around the middle; strategy 2: the first
        # three pages. For short documents the two strategies would be
        # identical, so duplicates are collapsed to avoid re-reading the
        # same page range twice.
        strategies = []
        if total_pages >= 3:
            middle_start = max(0, total_pages // 2 - 1)
            strategies.append((middle_start, 3))
            if (0, 3) not in strategies:
                strategies.append((0, 3))
        elif total_pages == 2:
            strategies.append((0, 2))
        else:
            strategies.append((0, 1))

        common_headers = []
        for start, pages_to_read in strategies:
            headers = get_headers(get_text, start, pages_to_read, total_pages)
            if len(headers) < 2:
                continue  # need at least two pages to compare
            current_common = find_common_headers(headers)
            if current_common:
                common_headers = current_common
                break  # first strategy that finds a header wins
        # If no strategy found anything, this joins an empty list -> "".
        return '\n'.join(common_headers)

    try:
        # Try PyPDF2 first; fall back to PyMuPDF (fitz) when it fails.
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_document = PdfReader(pdf_file)
                total_pages = len(pdf_document.pages)
                get_text = create_get_text_function('pypdf2', pdf_document)
                return extract_common_header_main(get_text, total_pages)
        except Exception as e_pypdf2:
            print(f"extract_common_header:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                with fitz.open(pdf_path) as pdf_document:
                    total_pages = pdf_document.page_count
                    get_text = create_get_text_function('fitz', pdf_document)
                    return extract_common_header_main(get_text, total_pages)
            except Exception as e_fitz:
                print(f"extract_common_header:使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return ""
    except Exception as e:
        print(f"Error in extract_common_header: {e}")
        return ""
|
2024-11-01 14:28:10 +08:00
|
|
|
|
|
|
|
|
|
def clean_page_content(text, common_header):
    """
    Strip the shared page header and leading page-number artifacts from the
    text of one page, returning the cleaned text.
    """
    # First remove each non-empty header line once, anchored at the start.
    if common_header:
        for header_line in common_header.split('\n'):
            stripped = header_line.strip()
            if not stripped:
                continue
            text = re.sub(r'^' + re.escape(stripped) + r'\n?', '', text, count=1)

    # Drop any leading whitespace before matching page-number patterns.
    text = text.lstrip()

    # Page-number formats removed in order, each anchored at the start:
    page_number_patterns = (
        r'^第\s*\d+\s*页\s*',                              # "第 x 页"
        r'^\s*\d+\s*(?=\D)',                               # bare number followed by a non-digit (e.g. the "89" of "89/129"); can also eat a section number fused with the page number
        r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*',               # /123, /共123, /共123页, /123页
        r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*',   # '—2—', '-2-', '-第2页-'
    )
    for pattern in page_number_patterns:
        text = re.sub(pattern, '', text)
    return text
|
|
|
|
|
|
2025-02-14 15:46:07 +08:00
|
|
|
|
|
2025-02-13 15:42:52 +08:00
|
|
|
|
def read_page_text_with_timeout(page, timeout=3):
    """
    Thread-safe text extraction with a timeout.

    Runs ``page.extract_text()`` in a daemon worker thread and waits up to
    ``timeout`` seconds for it to finish.

    Args:
        page: a PyPDF2 page object (anything exposing ``extract_text()``).
        timeout: seconds to wait for the extraction to complete.

    Returns:
        str: the extracted text ("" when extraction failed or yielded nothing).
        None: when the worker did not finish within ``timeout`` seconds.
    """
    result_queue = Queue(maxsize=1)
    done_flag = threading.Event()

    def extract_worker():
        try:
            txt = page.extract_text() or ""
            result_queue.put(txt)
        except Exception as e:
            print(f"文本提取异常: {str(e)}")
            result_queue.put("")
        finally:
            done_flag.set()  # always signal completion, success or failure

    worker = threading.Thread(
        target=extract_worker,
        daemon=True,  # a stuck extraction must never keep the process alive
        name=f"PDFTextExtract-{id(page)}"
    )
    worker.start()

    try:
        # Phase 1: wait for the completion event.
        if done_flag.wait(timeout):
            # Phase 2: the result is guaranteed to be queued at this point.
            return result_queue.get_nowait()

        # Timeout path: give up on this page.
        print(f"文本提取超时({timeout}s),终止操作")
        return None
    except Empty:
        # Rare: the event fired but the queue held no data.
        print("队列数据异常,返回空文本")
        return ""
    # NOTE: the original code called the private worker._stop() here; that
    # CPython internal does not actually terminate a thread and raises on a
    # live thread, so it was removed — the daemon flag already guarantees the
    # worker cannot block interpreter shutdown.
|
|
|
|
|
|
2025-02-13 15:42:52 +08:00
|
|
|
|
|
|
|
|
|
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
    """
    Decide whether a PDF is a scanned (image-only) document.

    Logic:
    1. Pages are read one by one; if extracting a page's text times out
       (> page_timeout seconds), the file is immediately judged scanned -> True.
    2. The first page with non-empty text proves it is a normal PDF -> False.
    3. If the first max_pages pages all lack visible text -> True.
    4. overall_timeout bounds the whole check via a wrapper thread.
    """
    verdict_queue = queue.Queue()
    stop_event = threading.Event()

    def probe():
        # Runs in a daemon thread; puts exactly one bool on verdict_queue.
        try:
            with open(file_path, 'rb') as f:
                reader = PdfReader(f)
                for i, page in enumerate(reader.pages):
                    if i >= max_pages or stop_event.is_set():
                        break

                    text = read_page_text_with_timeout(page, page_timeout)
                    if text is None:
                        print(f"Page {i + 1} timeout")
                        verdict_queue.put(True)
                        return

                    if text.strip():
                        print(f"Text found on page {i + 1}")
                        verdict_queue.put(False)
                        return

            # No text found in the inspected pages: treat as scanned.
            verdict_queue.put(True)
        except Exception as e:
            print(f"Processing error: {str(e)}")
            verdict_queue.put(True)
        finally:
            stop_event.set()

    checker = threading.Thread(target=probe, daemon=True)
    checker.start()

    try:
        return verdict_queue.get(timeout=overall_timeout)
    except queue.Empty:
        print(f"Overall timeout after {overall_timeout}s")
        return True  # or define a dedicated timeout status code
|
2025-02-13 15:42:52 +08:00
|
|
|
|
|
|
|
|
|
def is_pure_image(docx_path, percentage=0.3):
    """
    Heuristically decide whether a .docx file is image-only.

    Inspects the leading ``percentage`` (default 30%) of the document's
    paragraphs; if none of them carries any text, the file is treated as a
    pure-image document.
    """
    paragraphs = Document(docx_path).paragraphs
    # Always inspect at least one paragraph, even for tiny documents.
    sample_size = max(1, int(len(paragraphs) * percentage))
    return not any(p.text.strip() for p in paragraphs[:sample_size])
|
2024-12-06 09:25:24 +08:00
|
|
|
|
|
2024-12-05 15:01:37 +08:00
|
|
|
|
if __name__ == '__main__':
    pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\971a834e-5cf5-4259-ad85-320aaca051e0\ztbfile.pdf"
    # Classify the sample file: scanned (image-only) vs. normal text PDF.
    label = "扫描型" if is_scanned_pdf(pdf_path) else "普通型"
    print(label)
|