112 lines
4.6 KiB
Python
112 lines
4.6 KiB
Python
#flask_app/general/截取pdf通用函数.py
|
||
import concurrent.futures
|
||
import os
|
||
|
||
import regex
|
||
from PyPDF2 import PdfReader, PdfWriter
|
||
|
||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||
from flask_app.general.format_change import docx2pdf
|
||
|
||
|
||
def get_start_and_common_header(input_path, end_page):
    """Find the first page that looks like the start of the tender notice.

    Pages are visited in order starting at page 0. A page is skipped when it
    has no extractable text or when its cleaned text contains a line ending
    in the table-of-contents heading. The scan stops at the first remaining
    page whose cleaned text contains a line ending in one of the
    notice/invitation headings.

    Args:
        input_path: Path of the PDF to scan.
        end_page: Last zero-based page index that is examined; reaching a
            page beyond it ends the scan without a match.

    Returns:
        A ``(common_header, page_index)`` tuple. ``common_header`` is the
        value returned by ``extract_common_header(input_path)``;
        ``page_index`` is the zero-based index of the first matching page,
        or 0 when no page matched.
    """
    header = extract_common_header(input_path)

    # A line ending in a notice/invitation heading. The negative lookbehinds
    # reject headings that directly follow (ignoring whitespace) the
    # characters 见 / 与 or a quotation mark, i.e. references to the chapter
    # rather than the chapter title itself.
    notice_heading = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE
    )
    # A line ending in the table-of-contents heading.
    toc_heading = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for page_index, page in enumerate(PdfReader(input_path).pages):
        # Past the allowed range: give up and report index 0.
        if page_index > end_page:
            return header, 0

        raw_text = page.extract_text()
        if not raw_text:
            continue

        page_text = clean_page_content(raw_text, header)
        # Table-of-contents pages list the headings too; never treat them
        # as the start page.
        if toc_heading.search(page_text):
            continue
        if notice_heading.search(page_text):
            # First match wins; page indices are zero-based.
            return header, page_index

    return header, 0
|
||
|
||
def check_pdf_pages(pdf_path, logger):
    """Decide from the page count whether the PDF should be split at all.

    Args:
        pdf_path: Path of the PDF to inspect.
        logger: Object providing ``info`` and ``error`` logging methods.

    Returns:
        ``None`` when the PDF has more than 50 pages, meaning the caller
        should carry on with splitting. Otherwise a list of seven empty
        strings, returned both when the PDF has 50 pages or fewer and when
        the page count could not be read.
    """
    try:
        page_count = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_count}")
        if page_count > 50:
            # Large enough: None tells the caller to continue processing.
            return None
        logger.info("PDF页数小于或等于50页,跳过切分逻辑。")
        return [''] * 7
    except Exception as exc:
        logger.error(f"无法读取 PDF 页数: {exc}")
        # Same placeholder result: the splitting logic cannot run.
        return [''] * 7
|
||
|
||
|
||
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """Write pages ``start_page``..``end_page`` of a PDF to a new file.

    The output is saved as ``<source name>_<output_suffix>.pdf`` inside
    ``output_folder``. When ``output_suffix`` is ``'notice'`` and the range
    does not start at page 0, the front matter is additionally saved as
    ``<source name>_before.pdf``: pages 0 through the first page containing
    the table-of-contents heading, or, if no such page exists before
    ``start_page``, all pages before ``start_page``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory the output file(s) are written to.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).
        output_suffix: Suffix used in the output file name.
        common_header: Header text forwarded to ``clean_page_content`` when
            looking for the table-of-contents page.

    Returns:
        The path of the written PDF, or ``""`` when either bound is ``None``,
        the range is invalid, or any exception occurs (the error is printed).
    """
    try:
        # Both bounds are required.
        if start_page is None or end_page is None:
            print("Error: start_page 或 end_page 为 None")
            return ""

        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""

        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            before_doc = PdfWriter()
            toc_page = -1
            # Look for the table-of-contents page among the pages preceding
            # the notice. The range check above guarantees
            # start_page < total_pages, so no extra clamping is needed.
            for page_num in range(start_page):
                page_text = pdf_document.pages[page_num].extract_text()
                # Pages without a text layer (e.g. a scanned cover) cannot
                # contain the heading; skip them instead of handing empty
                # text to clean_page_content, mirroring the guard used in
                # get_start_and_common_header.
                if not page_text:
                    continue
                cleaned_text = clean_page_content(page_text, common_header)
                if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                    toc_page = page_num
                    break

            # Keep everything up to and including the TOC page when one was
            # found, otherwise everything before the notice.
            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            for page_num in range(pages_to_extract):
                before_doc.add_page(pdf_document.pages[page_num])
            print(before_pdf_path)
            with open(before_pdf_path, 'wb') as f_before:
                before_doc.write(f_before)

        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # an empty path rather than propagating.
        print(f"Error in save_extracted_pages: {e}")
        return ""
|
||
|
||
def is_pdf_or_doc(filename):
    """Return True when ``filename`` ends in .pdf, .doc or .docx (any case)."""
    accepted_suffixes = ('.pdf', '.doc', '.docx')
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in accepted_suffixes)
|
||
|
||
|
||
def convert_to_pdf(file_path):
    """Return a PDF path for ``file_path``, converting Word documents.

    Paths ending in .doc or .docx (case-insensitive) are handed to
    ``docx2pdf`` and its result is returned; every other path is returned
    unchanged. Only the file extension is inspected.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    return docx2pdf(file_path)
|
||
|
||
|