zbparse/flask_app/general/截取pdf通用函数.py

58 lines
2.2 KiB
Python
Raw Normal View History

2024-12-17 18:44:58 +08:00
#flask_app/general/截取pdf通用函数.py
import concurrent.futures
import os
import regex
from PyPDF2 import PdfReader
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
def get_start_and_common_header(input_path,end_page):
    """Locate the first page whose text ends a line with an announcement/invitation title.

    The common header is obtained from ``extract_common_header`` and passed to
    ``clean_page_content`` for every page before matching.  Pages are scanned
    in order; a page is ignored when it has no extractable text or when it
    contains a line ending in "目录" (table of contents).

    Args:
        input_path: Path of the PDF, handed to ``extract_common_header`` and
            ``PdfReader``.
        end_page: Highest 0-based page index that may be inspected.

    Returns:
        A ``(common_header, page_index)`` tuple.  ``page_index`` is the
        0-based index of the first matching page, or 0 when no page up to
        ``end_page`` matches.
    """
    header = extract_common_header(input_path)

    # A line ending in one of the announcement/invitation titles (optionally
    # followed by a closing parenthesis), as long as the title is not directly
    # preceded by 见 / 与 or a quotation mark.
    start_re = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # A line ending in "目录", allowing whitespace around and between the characters.
    toc_re = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for page_no, page in enumerate(PdfReader(input_path).pages):
        if page_no > end_page:
            # Ran past the permitted range without finding a start page.
            return header, 0

        raw_text = page.extract_text()
        if not raw_text:
            continue

        page_text = clean_page_content(raw_text, header)
        if toc_re.search(page_text):
            # Table-of-contents pages list the titles too; do not treat them as the start.
            continue
        if start_re.search(page_text):
            # First hit wins; page indices are 0-based.
            return header, page_no

    return header, 0
def check_pdf_pages(pdf_path, mode,logger):
    """Decide from the page count whether processing of the PDF should continue.

    Args:
        pdf_path: Path of the PDF handed to ``PdfReader``.
        mode: When equal to 1 the placeholder result has seven entries;
            for any other value it has four.
        logger: Object providing ``info`` and ``error`` methods.

    Returns:
        ``None`` when the PDF has more than 50 pages (the caller continues
        processing).  Otherwise, or when the page count cannot be read, a
        list of empty strings whose length depends on ``mode``.
    """
    def blank_result():
        # Seven empty strings for mode 1, four for every other mode.
        return [''] * (7 if mode == 1 else 4)

    try:
        page_total = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_total}")
        if page_total > 50:
            # More than 50 pages: None signals that processing should go on.
            return None
        logger.info("PDF页数小于或等于50页跳过切分逻辑。")
        return blank_result()
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        # The page count is unknown, so return the same placeholder as for a short PDF.
        return blank_result()