zbparse/flask_app/general/截取pdf通用函数.py

58 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#flask_app/general/截取pdf通用函数.py
import concurrent.futures
import os
import regex
from PyPDF2 import PdfReader
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
def get_start_and_common_header(input_path, end_page):
    """Locate the first page that carries an announcement/invitation heading.

    Pages are scanned in order from index 0. A page qualifies when, after
    cleaning, one of its lines ends with a heading keyword (招标公告, 磋商公告,
    谈判公告, 邀请书, 邀请函, 投标邀请, 磋商邀请, 谈判邀请), optionally followed
    by ")", and the keyword is not directly preceded by 见, 与 or a quote mark.
    Pages that contain a line ending in "目录" are skipped.

    Args:
        input_path: Path of the PDF; passed to ``extract_common_header`` and
            to ``PdfReader``.
        end_page: Highest 0-based page index that is still examined.

    Returns:
        A tuple ``(common_header, page_index)``. ``page_index`` is the 0-based
        index of the first qualifying page, or 0 when no page up to
        ``end_page`` qualifies (so 0 may mean "first page" or "not found").
    """
    header = extract_common_header(input_path)

    # Heading keyword at the end of a line; the negative lookbehinds reject
    # keywords that directly follow 见 / 与 / a quote character.
    heading_re = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # A line ending in "目录", with optional whitespace around the characters.
    toc_re = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for page_no, page in enumerate(PdfReader(input_path).pages):
        if page_no > end_page:
            # Search window exhausted without a hit: report index 0.
            return header, 0

        raw_text = page.extract_text()
        if not raw_text:
            continue

        page_body = clean_page_content(raw_text, header)
        if toc_re.search(page_body):
            # Pages containing a "目录" line never count as the start page.
            continue
        if heading_re.search(page_body):
            # First match wins; page indices are 0-based.
            return header, page_no

    return header, 0
def check_pdf_pages(pdf_path, mode, logger):
    """Decide from the page count whether a PDF should go on to be split.

    Args:
        pdf_path: Path of the PDF, passed to ``PdfReader``.
        mode: Selects the placeholder length: ``mode == 1`` yields seven empty
            strings, any other value yields four.
        logger: Object providing ``info`` and ``error`` methods.

    Returns:
        ``None`` when the PDF has more than 50 pages. Otherwise, including
        when the page count cannot be read, a list of empty strings whose
        length depends on ``mode``.
    """
    def _placeholders():
        # Fresh list on every call so callers never share one instance.
        return [''] * (7 if mode == 1 else 4)

    try:
        num_pages = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        if num_pages > 50:
            # More than 50 pages: None signals that processing continues.
            return None
        logger.info("PDF页数小于或等于50页跳过切分逻辑。")
        return _placeholders()
    except Exception as e:
        # Any failure while reading the page count yields the placeholders.
        logger.error(f"无法读取 PDF 页数: {e}")
        return _placeholders()