58 lines
2.2 KiB
Python
58 lines
2.2 KiB
Python
#flask_app/general/截取pdf通用函数.py
|
||
import concurrent.futures
|
||
import os
|
||
|
||
import regex
|
||
from PyPDF2 import PdfReader
|
||
|
||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||
|
||
def get_start_and_common_header(input_path, end_page):
    """Find the first page that looks like the start of the tender notice.

    Pages are scanned in order from index 0. A page qualifies when, after
    cleaning, one of its lines ends with an announcement/invitation title
    (e.g. 招标公告, 投标邀请), optionally followed by a closing parenthesis,
    and that title is not directly preceded by 见, 与 or a quotation mark.
    Pages containing a "目录" (table of contents) line are ignored.

    Args:
        input_path: Path of the PDF; passed to ``extract_common_header`` and
            ``PdfReader``.
        end_page: Highest 0-based page index that is still examined.

    Returns:
        A tuple ``(common_header, index)`` where ``common_header`` is the
        value returned by ``extract_common_header(input_path)`` and ``index``
        is the 0-based index of the first qualifying page, or 0 when no page
        up to ``end_page`` qualifies.
    """
    common_header = extract_common_header(input_path)

    # Title line that marks the beginning of the announcement section.
    title_re = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE,
    )
    # Table-of-contents heading; such pages list the titles without being the section itself.
    toc_re = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for page_no, page in enumerate(PdfReader(input_path).pages):
        if page_no > end_page:
            # Past the search window: give up and fall back to index 0.
            break

        raw_text = page.extract_text()
        if not raw_text:
            continue

        page_text = clean_page_content(raw_text, common_header)
        if toc_re.search(page_text):
            # Skip table-of-contents pages entirely.
            continue
        if title_re.search(page_text):
            # First match wins; page indices are 0-based.
            return common_header, page_no

    return common_header, 0
|
||
|
||
def check_pdf_pages(pdf_path, mode, logger):
    """Decide whether a PDF is long enough to go through the splitting logic.

    Args:
        pdf_path: Path of the PDF, passed to ``PdfReader``.
        mode: When equal to 1 the placeholder result has 7 entries,
            otherwise it has 4.
        logger: Object providing ``info`` and ``error`` methods.

    Returns:
        ``None`` when the PDF has more than 50 pages (the caller should carry
        on with splitting). Otherwise, i.e. when the PDF has at most 50 pages
        or its page count could not be read, a list of empty strings whose
        length depends on ``mode``.
    """
    try:
        num_pages = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        should_split = num_pages > 50
        if not should_split:
            logger.info("PDF页数小于或等于50页,跳过切分逻辑。")
    except Exception as e:
        # Unreadable PDF: report it and fall through to the placeholder result,
        # which signals that no further processing can be done.
        logger.error(f"无法读取 PDF 页数: {e}")
        should_split = False

    if should_split:
        # More than 50 pages: None tells the caller to continue processing.
        return None

    placeholder_count = 7 if mode == 1 else 4
    return [''] * placeholder_count
|
||
|
||
|
||
|