zbparse/flask_app/货物标/截取pdf货物标版.py
2024-12-11 17:42:51 +08:00
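# 模块概述(依据下方实现整理):按章节截取货物类招标文件PDF——
# 依次定位公告、评标办法、资格审查、投标人须知、采购需求、格式等章节的起止页,
# 分别另存为独立PDF,并将所需章节合并为 *_merged_baseinfo.pdf。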

import glob
import logging
from PyPDF2 import PdfReader, PdfWriter
import regex # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.format_change import docx2pdf
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods
import concurrent.futures
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
def is_pdf_or_doc(filename):
# 判断文件是否为PDF或Word文档
return filename.lower().endswith(('.pdf', '.doc', '.docx'))
def convert_to_pdf(file_path):
# 假设 docx2pdf 函数已经被定义,这里仅根据文件扩展名来决定是否需要转换
if file_path.lower().endswith(('.doc', '.docx')):
return docx2pdf(file_path)
return file_path
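# process_files:先将 doc/docx 统一转换为 PDF,再调用 extract_pages 截取目标章节;
# 结果统一以列表形式返回(tobidders_notice 返回两段路径,qualification1 另与 qualification3 合并),失败时返回 [""]。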
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
pdf_path = convert_to_pdf(file_path)
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if result:
if output_suffix == "tobidders_notice":
# 确保返回的是元组,并将其中的 None 转换为 ""
path1, path2 = result
return [path1 or "", path2 or ""]
elif output_suffix == "qualification1":
merge_and_cleanup(result, "qualification3")
return [result or ""]
return [result or ""]
return [""] # 返回空字符串
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"):
start_page = None
end_page = None
flag=True
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
if output_suffix == "tobidders_notice":
if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
flag=False
continue
else:
if exclusion_pattern and flag and regex.search(exclusion_pattern, cleaned_text):
flag=False
continue
if start_page is None and regex.search(begin_pattern, cleaned_text):
if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
start_page = i
continue
if start_page is not None:
if output_suffix == "tobidders_notice":
if regex.search(end_pattern, cleaned_text) and i > start_page:
end_page = i
break
else:
if regex.search(end_pattern, cleaned_text) and i > start_page and not regex.search(begin_pattern,cleaned_text):
end_page = i
break
return start_page, end_page
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
try:
common_header = extract_common_header(pdf_path)
pdf_document = PdfReader(pdf_path)
exclusion_pattern = None
total_pages = len(pdf_document.pages) - 1 # 最后一页的索引(页码从0开始)
if output_suffix == "tobidders_notice":
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成')
start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
)
if not start_page or not mid_page or not end_page:
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
return "", ""
# if start_page is None or end_page is None or mid_page is None:
# print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
# return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder,
"tobidders_notice_part1")
if mid_page != end_page:
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
"tobidders_notice_part2")
else:
path2=path1
return path1, path2
else:
# 原有的处理逻辑保持不变
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
common_header, exclusion_pattern, output_suffix)
# 针对 selection = 6 的特殊处理
if output_suffix == "format":
if start_page is None:
print(f"{output_suffix}: 未找到起始页,提取失败!")
return ""
if end_page is None:
# 如果未匹配到结束页,默认截取到文件末尾
end_page = total_pages
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。")
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
elif output_suffix == "qualification1":
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
except Exception as e:
print(f"Error processing {pdf_path}: {e}")
return ""
def get_patterns_for_procurement():
# begin_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
# regex.MULTILINE)
begin_pattern = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符
r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
r'(?:采购|需求)[\u4e00-\u9fff、()]*?)\s*$', # 或者匹配“采购”或“需求”
regex.MULTILINE
)
end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
return begin_pattern, end_pattern
def get_patterns_for_evaluation_method():
begin_pattern = regex.compile(
r'(?:(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 第一种模式
r'(?:[\u4e00-\u9fff、()]*?)'
r'(?=.*(?:磋商|谈判|评标|评定|评审))'
r'(?=.*(?:办法|方法|内容))'
r'[\u4e00-\u9fff、()]*\s*$|'
r'^\s*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)', # 第二种模式
regex.MULTILINE
)
end_pattern = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
return begin_pattern, end_pattern
def get_patterns_for_notice():
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
end_pattern = regex.compile(
# r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
regex.MULTILINE
)
return begin_pattern, end_pattern
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
# exclusion_pattern):
# start_page = None
# mid_page = None
# end_page = None
# for i, page in enumerate(pdf_document.pages):
# text = page.extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
# continue
# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
# start_page = i
# if start_page is not None and mid_page is None and regex.search(
# r'^\s*[(]?\s*[一1]\s*[)]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
# mid_page = i
# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
# end_page = i
# break
# return start_page, mid_page, end_page
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
"""
从PDF文档中提取起始页、中间页和结束页。
如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。
参数:
pdf_document (PDFDocument): 要处理的PDF文档对象。
begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
begin_page (int): 开始搜索的页码。
common_header (str): 每页需要清理的公共头部文本。
exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。
返回:
tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None)
"""
def run_extraction(local_begin_pattern, local_end_pattern=None):
"""
使用提供的 begin 和 end 模式运行提取过程。
如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。
参数:
local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
返回:
tuple: (start_page, mid_page, end_page)
"""
start_page = None
mid_page = None
end_page = None
chapter_type = None # 用于存储“章”或“部分”
combined_mid_pattern = None # 中间页的组合模式
pdf_document = PdfReader(pdf_path)
for i, page in enumerate(pdf_document.pages):
text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
continue
# 识别起始页
if start_page is None:
match = regex.search(local_begin_pattern, cleaned_text)
if match and i > begin_page:
start_page = i
matched_text = match.group(0) # 获取整个匹配的文本
# 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成
if not local_end_pattern:
if '章' in matched_text:
chapter_type = '章'
elif '部分' in matched_text:
chapter_type = '部分'
else:
chapter_type = None # 未匹配到“章”或“部分”
if chapter_type:
# 根据 chapter_type 动态生成 end_pattern
end_pattern_dynamic = regex.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
regex.MULTILINE
)
# 根据 chapter_type 动态生成 additional_mid_pattern
if chapter_type == '章':
additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
elif chapter_type == '部分':
additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
else:
additional_mid_pattern = ''
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
# 合并基础模式和额外模式
if additional_mid_pattern:
combined_mid_pattern = regex.compile(
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
regex.MULTILINE
)
else:
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
regex.MULTILINE
)
else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
end_pattern_dynamic = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
regex.MULTILINE
)
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
regex.MULTILINE
)
else:
# 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
regex.MULTILINE
)
continue
# 识别中间页
if start_page is not None and mid_page is None and combined_mid_pattern:
if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
continue
if regex.search(combined_mid_pattern, cleaned_text):
mid_page = i
# 识别结束页
if start_page is not None and mid_page is not None:
# 使用提供的 end_pattern 或动态生成的 end_pattern
current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
if regex.search(current_end_pattern, cleaned_text):
if i > mid_page:
end_page = i
break
return start_page, mid_page, end_page
# 第一次提取尝试,使用初始的 begin_pattern
start_page, mid_page, end_page = run_extraction(begin_pattern)
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
if not (start_page and mid_page and end_page):
print(f"第二次尝试 tobidders_notice!")
pdf_document = PdfReader(pdf_path)
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
if start_page and end_page and mid_page:
return start_page, mid_page, end_page
else:
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
regex.MULTILINE
)
new_end_pattern = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
regex.MULTILINE
)
print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
return start_page, mid_page, end_page
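# 第二次尝试:按“第X章/部分 投标人须知”标题定位第一部分;若其后章节标题含合同、评标等排除关键词,
# 则前附表与正文视为同一范围,否则继续向后定位第二部分的结束页。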
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
output_suffix = "tobidders_notice"
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
regex.MULTILINE
)
end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
)
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成')
# 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
if start_page1 is None or end_page1 is None:
return "", "", ""
# 提取第二部分
start_page2 = end_page1
# 检查end_page1页面的内容
text = pdf_document.pages[end_page1].extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
match = end_pattern.search(cleaned_text)
if match:
# 获取匹配到的中文部分
chapter_title = match.group(1)
# 检查是否包含排除关键词
if any(word in chapter_title for word in exclusion_words):
# 如果包含排除关键词,直接返回相同的路径
return start_page1, end_page1, end_page1
# 如果不包含排除关键词,继续提取第二部分
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
exclusion_pattern, output_suffix)
if end_page2 is None:
return start_page1, end_page1, ""
return start_page1, end_page1, end_page2
def extract_pages_qualification(pdf_document, begin_page, common_header):
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = regex.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)',
regex.MULTILINE
)
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
priority_pattern = regex.compile(
r'^(资格性检查|资格审查|符合性审查)',
regex.MULTILINE
)
# 结束匹配模式 - 附录、附件、附表等
end_pattern_attachment = regex.compile(
r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$',
regex.MULTILINE
)
# 结束匹配模式 - 章节标题
end_pattern_chapter = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
regex.MULTILINE
)
print("第二次尝试 qualification:匹配附件")
start_page = None
end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
exclude_keywords = ["声明函", "承诺函"]
# 从指定的开始页开始遍历
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
# 优先检查是否匹配优先模式
priority_match = priority_pattern.search(cleaned_text)
if priority_match and start_page is None:
start_page = i
# print(f"匹配到优先模式,设置起始页为: {start_page}")
continue
else:
# 如果未匹配优先模式,则按照一般模式进行判断
if (
any(keyword in cleaned_text for keyword in include_keywords) and
all(keyword not in cleaned_text for keyword in exclude_keywords) and
start_page is None
):
if begin_pattern.search(cleaned_text):
start_page = i
continue
# print(f"匹配到附录等模式,设置起始页为: {start_page}")
# 确定结束页 - 附录、附件、附表等
if start_page is not None and end_pattern_attachment.search(cleaned_text):
# 额外检查当前页面是否不包含任何 include_keywords
if not any(keyword in cleaned_text for keyword in include_keywords):
if i > start_page:
end_page = i
# print(f"找到结束页 (附件类): {end_page}")
break # 找到结束页后退出循环
else:
print(f"当前页面匹配附件结束模式但包含 include_keywords继续查找。页面: {i}")
# 确定结束页 - 章节标题
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
if i > start_page:
end_page = i
print(f"找到结束页 (章节标题): {end_page}")
break # 找到结束页后退出循环
return start_page, end_page
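# 备用提取策略:按 output_suffix 选择对应的备用正则重新定位章节;
# qualification1 失败时改用附录/附件形式匹配,仍失败则复制评标办法章节作为 qualification2。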
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
try:
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
pdf_document = PdfReader(pdf_path)
patterns = None
start_page = None
end_page = None
if output_suffix == "procurement":
patterns = [get_patterns_for_procurement()]
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()]
elif output_suffix == "notice":
patterns = [get_patterns_for_notice()]
elif output_suffix == "qualification1":
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
if patterns:
for pattern_pair in patterns:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header, exclusion_pattern, output_suffix)
if start_page is not None and end_page is not None:
break
if start_page is None or end_page is None:
if output_suffix == "qualification1":
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
print("第三次尝试资格审查:尝试提取评分办法章节...")
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")
if os.path.isfile(evaluation_method_file):
print(f"找到评分办法章节文件: {evaluation_method_file},生成新文件。")
# 获取文件路径和文件名
new_file_name = f"{base_file_name}_qualification2.pdf"
new_file_path = os.path.join(output_folder, new_file_name)
# 复制文件
with open(evaluation_method_file, 'rb') as original_file:
with open(new_file_path, 'wb') as new_file:
new_file.write(original_file.read())
return new_file_path
else:
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
else:
return ""
else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
except Exception as e:
print(f"Error in extract_pages_twice: {e}")
return ""
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
try:
# 检查 start_page 和 end_page 是否为 None
if start_page is None or end_page is None:
print("Error: start_page 或 end_page 为 None")
return ""
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page:
print(f"无效的页面范围: {start_page}{end_page}")
return ""
if output_suffix == 'notice' and start_page - 1 >= 0:
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
before_doc = PdfWriter()
for page_num in range(0, start_page):
before_doc.add_page(pdf_document.pages[page_num])
with open(before_pdf_path, 'wb') as f:
before_doc.write(f)
# print(f"已保存页面从 0 到 {start_page - 1} 为 {before_pdf_path}")
output_doc = PdfWriter()
for page_num in range(start_page, end_page + 1):
output_doc.add_page(pdf_document.pages[page_num])
with open(output_pdf_path, 'wb') as f:
output_doc.write(f)
print(f"{output_suffix} 已截取并保存页面从 {start_page}{end_page}{output_pdf_path}")
return output_pdf_path
except Exception as e:
print(f"Error in save_extracted_pages: {e}")
return "" # 返回空字符串
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
begin_pattern = regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE
)
# 新增目录匹配模式
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages):
if i > 10:
return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
# 检查是否存在"目录"
if catalog_pattern.search(cleaned_text):
continue # 如果存在目录,跳过当前页面
if begin_pattern.search(cleaned_text):
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
return common_header, last_begin_index
return common_header, last_begin_index
def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
try:
# 检查是否为文件夹
if os.path.isdir(input_path):
generated_files = []
for file_name in os.listdir(input_path):
file_path = os.path.join(input_path, file_name)
if is_pdf_or_doc(file_path):
result = process_input(file_path, output_folder, selection, output_suffix)
if isinstance(result, tuple):
generated_files.extend([f if f else "" for f in result])
else:
generated_files.append(result)
return generated_files
# 单文件情况
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
return process_input(input_path, output_folder, selection, output_suffix)
else:
print("提供的路径既不是文件夹也不是PDF文件。")
return ['']
except Exception as e:
print(f"Error in truncate_pdf_main: {e}")
return [''] # 返回空字符串
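# process_input 中 selection 与输出后缀、默认起始搜索页(仅当未定位到公告页时使用)的对应关系:
#   1 -> notice(公告), begin_page=0
#   2 -> evaluation_method(评标办法), begin_page=5
#   3 -> qualification1(资格审查), begin_page=5
#   4 -> tobidders_notice(投标人须知), begin_page=1
#   5 -> procurement(采购需求), begin_page=3
#   6 -> format(格式), begin_page=0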
def process_input(input_path, output_folder, selection, output_suffix):
try:
# 创建输出文件夹
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 获取起始和通用页眉
common_header, last_begin_index = get_start_and_common_header(input_path)
begin_page = last_begin_index if last_begin_index != 0 else {
4: 1,
2: 5,
3: 5,
1: 0,
5: 3
}.get(selection, 0)
# 根据选择设置对应的模式和结束模式
if selection == 1:
begin_pattern = regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE
)
end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$', regex.MULTILINE)
local_output_suffix = "notice"
elif selection == 2:
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "evaluation_method"
elif selection == 3:
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE)
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "qualification1"
elif selection == 4:
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
regex.MULTILINE)
end_pattern = None
local_output_suffix = "tobidders_notice"
elif selection == 5:
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明'
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "procurement"
# begin_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
# r'^[一二三四五六七八九十百千]+、\s*采购清单', regex.MULTILINE)
# end_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
elif selection == 6:
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "format"
else:
print("无效的选择:请选择1-5")
return ['']
# 如果传入的 output_suffix 是 'default',则使用本地生成的 output_suffix
if output_suffix == "default":
output_suffix = local_output_suffix
# 调用实际处理文件内容的函数
return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
except Exception as e:
print(f"Error in process_input: {e}")
return ['']
def truncate_pdf_multiple(pdf_path, output_folder, logger):
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
truncate_files = []
# 定义要处理的选择范围
selections = range(1, 6)
# 使用 ThreadPoolExecutor 进行多线程处理
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# 提交所有任务并保持 selection 顺序
future_to_selection = {
selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
for selection in selections}
# 按 selection 顺序收集结果
for selection in selections:
future = future_to_selection.get(selection)
try:
files = future.result()
if files:
truncate_files.extend(files)
except Exception as e:
logger.error(f"Selection {selection} 生成了一个异常: {e}")
# 定义合并后的输出路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
# 调用 merge_selected_pdfs 并获取返回值
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_path:
# 合并成功,添加合并后的文件路径
truncate_files.append(merged_path)
# logger.info(f"已生成合并文件: {merged_output_path}")
else:
# 合并失败,添加空字符串
truncate_files.append("")
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
logger.info("已截取文件路径" + str(truncate_files))
return truncate_files
# 小解析,只需要前三章内容
def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
"""
处理 PDF 文件,选择指定的 selections并合并结果。
Args:
pdf_path (str): 要处理的 PDF 文件路径。
output_folder (str): 截取后的文件保存文件夹路径。
selections (iterable): 要处理的 selection 列表。
Returns:
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
"""
logger = get_global_logger(unique_id)
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
truncate_files = []
# 使用 ThreadPoolExecutor 进行多线程处理
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
# 提交所有任务并保持 selection 顺序
future_to_selection = {
selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
for selection in selections}
# 按 selection 顺序收集结果
for selection in selections:
future = future_to_selection.get(selection)
try:
files = future.result()
if files:
if isinstance(files, list):
truncate_files.extend(files)
elif isinstance(files, str):
truncate_files.append(files)
except Exception as e:
logger.error(f"Selection {selection} 生成了一个异常: {e}")
# 定义合并后的输出路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
# 调用 merge_selected_pdfs 并获取返回值
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_path:
# 合并成功,添加合并后的文件路径
truncate_files.append(merged_path)
# logger.info(f"已生成合并文件: {merged_output_path}")
else:
# 合并失败,添加空字符串
truncate_files.append("")
logger.warning(f"合并失败没有生成merged_baseinfo for {pdf_path}")
return truncate_files
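# truncate_pdf_specific_goods 调用示意(假设性示例,路径为虚构):
#   files = truncate_pdf_specific_goods(r"D:\test\ztbfile.pdf", r"D:\test\out", [1, 3, 5])
#   # 返回各 selection 的截取文件路径,末尾追加合并后的 merged_baseinfo 路径(合并失败时为 "")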
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
input_path=r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
files = truncate_pdf_multiple(input_path, output_folder,logger)
# selections = [3,5]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
print(files)
# selection 取值说明: 1-公告, 2-评标办法, 3-资格审查(后缀有qualification1或qualification2,与评标办法一致),
# 4-投标人须知(前附表part1/正文part2), 5-采购需求
# selection = 2
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)