58 lines
2.2 KiB
Python
58 lines
2.2 KiB
Python
|
#flask_app/general/截取pdf通用函数.py
|
|||
|
import concurrent.futures
|
|||
|
import os
|
|||
|
|
|||
|
import regex
|
|||
|
from PyPDF2 import PdfReader
|
|||
|
|
|||
|
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
|||
|
|
|||
|
def get_start_and_common_header(input_path, end_page):
    """Locate the first page whose text contains an announcement/invitation heading.

    The common header is obtained from ``extract_common_header(input_path)``
    and is passed to ``clean_page_content`` for every page before matching.

    Args:
        input_path: Path of the PDF; given to both ``extract_common_header``
            and ``PdfReader``.
        end_page: Highest 0-based page index that is still inspected; the
            scan stops at the first page whose index is greater than this.

    Returns:
        A tuple ``(common_header, page_index)``. ``page_index`` is the
        0-based index of the first matching page, or ``0`` when no page
        within the limit matches.
        NOTE(review): a result of 0 therefore does not distinguish "matched
        on the first page" from "no match" -- confirm callers expect this.
    """
    header = extract_common_header(input_path)

    # A line that ends with one of the announcement/invitation keywords
    # (optionally followed by a closing parenthesis) and whose keyword is not
    # directly preceded by "见", "与" or a quotation mark.
    heading_re = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE
    )
    # A line that ends with "目录" (table of contents), spaces allowed.
    toc_re = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    reader = PdfReader(input_path)
    for page_no, pdf_page in enumerate(reader.pages):
        if page_no > end_page:
            # Past the scan limit: give up and fall through to the 0 result.
            break

        raw_text = pdf_page.extract_text()
        if not raw_text:
            continue

        body_text = clean_page_content(raw_text, header)
        if toc_re.search(body_text):
            # Table-of-contents pages are skipped without checking for a heading.
            continue
        if heading_re.search(body_text):
            return header, page_no

    return header, 0
|
|||
|
|
|||
|
def check_pdf_pages(pdf_path, mode, logger):
    """Check the page count of a PDF and decide whether processing continues.

    Args:
        pdf_path: Path of the PDF, passed to ``PdfReader``.
        mode: Selects the length of the placeholder result: ``1`` gives
            seven empty strings, any other value gives four.
        logger: Object providing ``info`` and ``error`` methods.

    Returns:
        ``None`` when the PDF has more than 50 pages, meaning the caller
        should carry on. Otherwise a list of empty strings (length 7 or 4,
        see ``mode``). The same placeholder list is returned when the page
        count cannot be read; the error is logged rather than raised.
    """

    def _blank_result():
        # Built lazily so the mode comparison happens only on paths that
        # actually return the placeholder list.
        return [''] * 7 if mode == 1 else [''] * 4

    try:
        page_total = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_total}")
        if page_total > 50:
            # Long enough: None tells the caller to continue processing.
            return None
        logger.info("PDF页数小于或等于50页,跳过切分逻辑。")
        return _blank_result()
    except Exception as exc:
        logger.error(f"无法读取 PDF 页数: {exc}")
        # The placeholder result signals that later processing cannot run.
        return _blank_result()
|
|||
|
|
|||
|
|
|||
|
|