截取pdf工程/货物尝试合并
This commit is contained in:
parent
d3425adcfb
commit
6c5163b2fb
@ -259,6 +259,246 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page):
|
|||||||
print(f"处理文件 {file_path} 时发生错误: {e}")
|
print(f"处理文件 {file_path} 时发生错误: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal"):
    """
    Scan pages from ``begin_page`` onward and return the first start/end page pair.

    Args:
        pdf_document: reader object whose ``pages`` sequence can be sliced and
            whose pages provide ``extract_text()``.
        begin_pattern (str or regex.Pattern): pattern marking the start page.
        end_pattern (str or regex.Pattern): pattern marking the end page.
        begin_page (int): index of the first page to scan.
        common_header: forwarded to ``clean_page_content`` for every page.
        exclusion_pattern (str or regex.Pattern, optional): the first page
            matching it is skipped; this happens at most once per call.
        output_suffix (str): selects the matching rules. ``"tobidders_notice"``
            applies the exclusion only after the start page is known and skips
            catalog pages when ``begin_page`` is 0; ``"notice"`` lets the start
            page be ``begin_page`` itself; every other value requires the start
            page to come strictly after ``begin_page``.

    Returns:
        tuple: ``(start_page, end_page)`` page indices; an entry is ``None``
        when the corresponding page was not found.
    """
    start_page = None
    end_page = None
    exclusion_armed = True  # the exclusion pattern may skip one page only
    is_tobidders = output_suffix == "tobidders_notice"
    # Loop-invariant, so compiled once instead of once per page.
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        cleaned_text = clean_page_content(page.extract_text() or "", common_header)

        # For ordinary sections the "composition / preparation of the bid
        # documents" text usually sits inside the instructions-to-bidders
        # table/body, so the exclusion is used before the start page is found to
        # keep begin_pattern from matching there. For tobidders_notice it is
        # used after the start page is found instead.
        if is_tobidders:
            exclusion_applies = start_page is not None
        else:
            exclusion_applies = start_page is None
        if (exclusion_pattern and exclusion_armed and exclusion_applies
                and regex.search(exclusion_pattern, cleaned_text)):
            exclusion_armed = False
            continue

        # Skip table-of-contents pages (tobidders_notice scans from page 0 only).
        if is_tobidders and begin_page == 0 and catalog_pattern.search(cleaned_text):
            continue

        if start_page is None:
            # ``i`` never drops below begin_page, so "notice" accepts any page
            # while the other suffixes need a page strictly after begin_page.
            if regex.search(begin_pattern, cleaned_text) and (output_suffix == "notice" or i > begin_page):
                start_page = i
            continue

        if i > start_page and regex.search(end_pattern, cleaned_text):
            # Outside tobidders_notice, a page that also matches begin_pattern
            # is not accepted as the end page.
            if is_tobidders or not regex.search(begin_pattern, cleaned_text):
                end_page = i
                break

    return start_page, end_page
|
||||||
|
|
||||||
|
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
    """
    Find the start, middle and end pages of the tobidders_notice section of a PDF.

    Three attempts are made, stopping at the first that yields all three pages:
      1. scan with ``begin_pattern`` and patterns derived from the matched heading;
      2. ``extract_pages_twice_tobidders_notice``;
      3. scan again with a looser begin pattern and a fixed end pattern.

    Args:
        pdf_path: path of the PDF, handed to ``PdfReader``.
        begin_pattern (str or regex.Pattern): pattern marking the start page.
        begin_page (int): the start page must come strictly after this index;
            when it is 0, catalog pages are skipped.
        common_header: forwarded to ``clean_page_content`` for every page.
        exclusion_pattern (str or regex.Pattern): once the middle page is
            known, pages matching this pattern are skipped.

    Returns:
        tuple: ``(start_page, mid_page, end_page)`` page indices. When all
        attempts fail, the result of the last scan is returned, in which the
        pages that were not found are ``None``.
    """
    # Negative lookbehinds shared by several patterns: the heading must not be
    # preceded by 见 / 与 or by a quotation mark.
    guard = r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
    evaluation_tail = rf'{guard}评标(方法|办法)前附表\s*$|{guard}评标(方法|办法)正文\s*$'
    list_heading = (r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
                    r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)')
    # Middle-page pattern used together with a fixed end pattern; it also
    # matches lines such as "东莞市粤隆招标有限公司编制 46一、说明".
    fixed_mid_pattern = regex.compile(
        r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$',
        regex.MULTILINE
    )
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
    # Opened once and shared by every attempt instead of re-reading the file.
    pdf_document = PdfReader(pdf_path)

    def build_dynamic_patterns(matched_text):
        """Return ``(end_pattern, mid_pattern)`` derived from the matched start heading."""
        if '章' in matched_text:
            chapter_type, other_type = '章', '部分'
        elif '部分' in matched_text:
            chapter_type, other_type = '部分', '章'
        else:
            # Neither "章" nor "部分" in the heading: fall back to the defaults.
            end = regex.compile(
                rf'^{guard}第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|{evaluation_tail}',
                regex.MULTILINE
            )
            return end, regex.compile(list_heading, regex.MULTILINE)

        # The section ends at the next heading of the same kind ...
        end = regex.compile(
            rf'^{guard}第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|{evaluation_tail}',
            regex.MULTILINE
        )
        # ... while a heading of the other kind counts as a middle page.
        base_mid = list_heading + rf'|{guard}(?:投标人?|磋商|谈判|供应商|应答人)\s*须知正文\s*$'
        additional_mid = rf'^第[一二三四五六七八九十百千]+?(?:{other_type})'
        mid = regex.compile(rf'(?:{base_mid})|(?:{additional_mid})', regex.MULTILINE)
        return end, mid

    def run_extraction(local_begin_pattern, local_end_pattern=None):
        """
        Run one scan over all pages.

        Args:
            local_begin_pattern (str or regex.Pattern): pattern marking the start page.
            local_end_pattern (str or regex.Pattern, optional): fixed end
                pattern; when omitted, the end and middle patterns are derived
                from the heading matched on the start page.

        Returns:
            tuple: ``(start_page, mid_page, end_page)``; missing pages are ``None``.
        """
        start_page = mid_page = end_page = None
        mid_pattern = None
        end_pattern = local_end_pattern

        for i, page in enumerate(pdf_document.pages):
            cleaned_text = clean_page_content(page.extract_text() or "", common_header)

            # Once the middle page is known, skip pages matching the exclusion pattern.
            if mid_page is not None and exclusion_pattern and regex.search(exclusion_pattern, cleaned_text):
                continue
            # Skip table-of-contents pages.
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue

            # Start page: nothing else is looked for until it has been found.
            if start_page is None:
                match = regex.search(local_begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    if local_end_pattern:
                        mid_pattern = fixed_mid_pattern
                    else:
                        end_pattern, mid_pattern = build_dynamic_patterns(match.group(0))
                continue

            # Middle page.
            if mid_page is None and regex.search(mid_pattern, cleaned_text):
                mid_page = i

            # End page: must come strictly after the middle page.
            if mid_page is not None and i > mid_page and regex.search(end_pattern, cleaned_text):
                end_page = i
                break

        return start_page, mid_page, end_page

    # First attempt: the caller's begin pattern.
    start_page, mid_page, end_page = run_extraction(begin_pattern)
    if start_page and mid_page and end_page:
        return start_page, mid_page, end_page

    # Second attempt: the chapter-heading based fallback.
    print("第二次尝试 tobidders_notice!")
    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
    if start_page and end_page and mid_page:
        return start_page, mid_page, end_page

    # Third attempt: looser begin pattern and a fixed end pattern.
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|'
        r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表',
        regex.MULTILINE
    )
    new_end_pattern = regex.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
        regex.MULTILINE
    )
    print("第三次尝试 tobidders_notice! ")
    return run_extraction(new_begin_pattern, new_end_pattern)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
    """
    Fallback locator for the tobidders_notice section based on chapter headings.

    The first part runs from a "第X章/部分 … 须知" heading to the next chapter
    heading. Unless that next heading contains one of the excluded keywords, the
    chapter after it is treated as the second part.

    Args:
        pdf_document: reader object passed to ``extract_pages_generic``; its
            ``pages`` are also indexed directly here.
        common_header: forwarded to ``clean_page_content``.
        begin_page (int): page index from which the first search starts.

    Returns:
        tuple: ``("", "", "")`` when the first part cannot be found;
        ``(start, mid, mid)`` when the chapter after the first part is excluded;
        ``(start, mid, "")`` when the second part has no end page;
        otherwise ``(start, mid, end)`` as page indices.
    """
    suffix = "tobidders_notice"
    notice_heading = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+',
        regex.MULTILINE
    )
    # Group 1 captures the Chinese title that follows the chapter number.
    chapter_heading = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE
    )
    # Chapter titles containing any of these keywords are not taken as a second part.
    skip_keywords = ("合同", "评标", "开标", "评审", "采购", "资格")
    composition_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')

    # First part.
    first_start, first_end = extract_pages_generic(
        pdf_document, notice_heading, chapter_heading, begin_page, common_header, composition_pattern, suffix
    )
    if first_start is None or first_end is None:
        return "", "", ""

    # Look at the heading on the page that closed the first part.
    boundary_text = clean_page_content(pdf_document.pages[first_end].extract_text() or "", common_header)
    heading = chapter_heading.search(boundary_text)
    if heading is not None:
        title = heading.group(1)
        for keyword in skip_keywords:
            if keyword in title:
                # Excluded chapter: report the same page as both middle and end.
                return first_start, first_end, first_end

    # Second part: the scan starts one page earlier because the generic search
    # only accepts a start page strictly after its begin_page.
    second_end = extract_pages_generic(
        pdf_document, chapter_heading, chapter_heading, first_end - 1, common_header, composition_pattern, suffix
    )[1]
    return first_start, first_end, ("" if second_end is None else second_end)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf'
|
file_path=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf'
|
||||||
output_folder=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp'
|
output_folder=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp'
|
||||||
|
@ -139,25 +139,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
for i in range(len(pdf_document.pages)):
|
for i in range(len(pdf_document.pages)):
|
||||||
page = pdf_document.pages[i]
|
page = pdf_document.pages[i]
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
if flag and regex.search(exclusion_pattern, cleaned_text):
|
||||||
if flag and regex.search(exclusion_pattern, cleaned_text):
|
flag = False
|
||||||
flag = False
|
continue
|
||||||
|
if start_page is None and regex.search(begin_pattern, cleaned_text):
|
||||||
|
if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
|
||||||
|
start_page = i
|
||||||
continue
|
continue
|
||||||
if regex.search(begin_pattern, cleaned_text) and i >= begin_page:
|
if start_page is not None:
|
||||||
if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
|
if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
|
||||||
pass
|
end_page = i
|
||||||
else:
|
break
|
||||||
start_page = i
|
|
||||||
if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,
|
|
||||||
cleaned_text):
|
|
||||||
condition = i > start_page
|
|
||||||
if condition:
|
|
||||||
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
|
|
||||||
if is_invalid_condition or output_suffix != "invalid":
|
|
||||||
end_page = i
|
|
||||||
break # 找到结束页后退出循环
|
|
||||||
|
|
||||||
# 移除对 extract_pages_twice 的调用
|
# 移除对 extract_pages_twice 的调用
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
|
@ -4,51 +4,14 @@ import os # 用于文件和文件夹操作
|
|||||||
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
||||||
from flask_app.general.merge_pdfs import merge_and_cleanup
|
from flask_app.general.merge_pdfs import merge_and_cleanup
|
||||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
|
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
|
||||||
convert_to_pdf, get_invalid_file
|
convert_to_pdf, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic
|
||||||
from flask_app.general.通用功能函数 import get_global_logger
|
from flask_app.general.通用功能函数 import get_global_logger
|
||||||
|
|
||||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal"):
    """
    Scan pages from ``begin_page`` onward and return the first start/end page pair.

    Args:
        pdf_document: reader object whose ``pages`` sequence can be sliced and
            whose pages provide ``extract_text()``.
        begin_pattern (str or regex.Pattern): pattern marking the start page.
        end_pattern (str or regex.Pattern): pattern marking the end page.
        begin_page (int): index of the first page to scan.
        common_header: forwarded to ``clean_page_content`` for every page.
        exclusion_pattern (str or regex.Pattern, optional): the first page
            matching it is skipped; this happens at most once per call.
        output_suffix (str): selects the matching rules. ``"tobidders_notice"``
            applies the exclusion only after the start page is known and skips
            catalog pages when ``begin_page`` is 0; ``"notice"`` lets the start
            page be ``begin_page`` itself; every other value requires the start
            page to come strictly after ``begin_page``.

    Returns:
        tuple: ``(start_page, end_page)`` page indices; an entry is ``None``
        when the corresponding page was not found.
    """
    start_page = None
    end_page = None
    exclusion_armed = True  # the exclusion pattern may skip one page only
    is_tobidders = output_suffix == "tobidders_notice"
    # Loop-invariant, so compiled once instead of once per page.
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        cleaned_text = clean_page_content(page.extract_text() or "", common_header)

        # For ordinary sections the "composition / preparation of the bid
        # documents" text usually sits inside the instructions-to-bidders
        # table/body, so the exclusion is used before the start page is found to
        # keep begin_pattern from matching there. For tobidders_notice it is
        # used after the start page is found instead.
        if is_tobidders:
            exclusion_applies = start_page is not None
        else:
            exclusion_applies = start_page is None
        if (exclusion_pattern and exclusion_armed and exclusion_applies
                and regex.search(exclusion_pattern, cleaned_text)):
            exclusion_armed = False
            continue

        # Skip table-of-contents pages (tobidders_notice scans from page 0 only).
        if is_tobidders and begin_page == 0 and catalog_pattern.search(cleaned_text):
            continue

        if start_page is None:
            # ``i`` never drops below begin_page, so "notice" accepts any page
            # while the other suffixes need a page strictly after begin_page.
            if regex.search(begin_pattern, cleaned_text) and (output_suffix == "notice" or i > begin_page):
                start_page = i
            continue

        if i > start_page and regex.search(end_pattern, cleaned_text):
            # Outside tobidders_notice, a page that also matches begin_pattern
            # is not accepted as the end page.
            if is_tobidders or not regex.search(begin_pattern, cleaned_text):
                end_page = i
                break

    return start_page, end_page
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix,logger):
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix,logger):
|
||||||
try:
|
try:
|
||||||
common_header = extract_common_header(pdf_path)
|
common_header = extract_common_header(pdf_path)
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
exclusion_pattern = None
|
exclusion_pattern = None
|
||||||
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
|
||||||
|
|
||||||
if output_suffix == "tobidders_notice":
|
if output_suffix == "tobidders_notice":
|
||||||
exclusion_pattern = regex.compile(
|
exclusion_pattern = regex.compile(
|
||||||
@ -59,9 +22,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
if not start_page or not mid_page or not end_page:
|
if not start_page or not mid_page or not end_page:
|
||||||
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
|
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
|
||||||
return "", ""
|
return "", ""
|
||||||
# if start_page is None or end_page is None or mid_page is None:
|
|
||||||
# print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
|
||||||
# return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
|
|
||||||
path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
|
path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
|
||||||
if mid_page != end_page:
|
if mid_page != end_page:
|
||||||
path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
|
path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
|
||||||
@ -76,16 +36,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
||||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
||||||
common_header, exclusion_pattern, output_suffix)
|
common_header, exclusion_pattern, output_suffix)
|
||||||
# 针对 selection = 6 的特殊处理
|
|
||||||
if output_suffix == "format":
|
|
||||||
if start_page is None:
|
|
||||||
print(f"{output_suffix}: 未找到起始页,提取失败!")
|
|
||||||
return ""
|
|
||||||
if end_page is None:
|
|
||||||
# 如果未匹配到结束页,默认截取到文件末尾
|
|
||||||
end_page = total_pages
|
|
||||||
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。")
|
|
||||||
return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header)
|
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
|
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
|
||||||
@ -136,212 +86,6 @@ def get_patterns_for_notice():
|
|||||||
)
|
)
|
||||||
return begin_pattern, end_pattern
|
return begin_pattern, end_pattern
|
||||||
|
|
||||||
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
    """
    Find the start, middle and end pages of the tobidders_notice section of a PDF.

    Three attempts are made, stopping at the first that yields all three pages:
      1. scan with ``begin_pattern`` and patterns derived from the matched heading;
      2. ``extract_pages_twice_tobidders_notice``;
      3. scan again with a looser begin pattern and a fixed end pattern.

    Args:
        pdf_path: path of the PDF, handed to ``PdfReader``.
        begin_pattern (str or regex.Pattern): pattern marking the start page.
        begin_page (int): the start page must come strictly after this index;
            when it is 0, catalog pages are skipped.
        common_header: forwarded to ``clean_page_content`` for every page.
        exclusion_pattern (str or regex.Pattern): once the middle page is
            known, pages matching this pattern are skipped.

    Returns:
        tuple: ``(start_page, mid_page, end_page)`` page indices. When all
        attempts fail, the result of the last scan is returned, in which the
        pages that were not found are ``None``.
    """
    # Negative lookbehinds shared by several patterns: the heading must not be
    # preceded by 见 / 与 or by a quotation mark.
    guard = r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
    evaluation_tail = rf'{guard}评标(方法|办法)前附表\s*$|{guard}评标(方法|办法)正文\s*$'
    list_heading = (r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
                    r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)')
    # Middle-page pattern used together with a fixed end pattern; it also
    # matches lines such as "东莞市粤隆招标有限公司编制 46一、说明".
    fixed_mid_pattern = regex.compile(
        r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$',
        regex.MULTILINE
    )
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
    # Opened once and shared by every attempt instead of re-reading the file.
    pdf_document = PdfReader(pdf_path)

    def build_dynamic_patterns(matched_text):
        """Return ``(end_pattern, mid_pattern)`` derived from the matched start heading."""
        if '章' in matched_text:
            chapter_type, other_type = '章', '部分'
        elif '部分' in matched_text:
            chapter_type, other_type = '部分', '章'
        else:
            # Neither "章" nor "部分" in the heading: fall back to the defaults.
            end = regex.compile(
                rf'^{guard}第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|{evaluation_tail}',
                regex.MULTILINE
            )
            return end, regex.compile(list_heading, regex.MULTILINE)

        # The section ends at the next heading of the same kind ...
        end = regex.compile(
            rf'^{guard}第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|{evaluation_tail}',
            regex.MULTILINE
        )
        # ... while a heading of the other kind counts as a middle page.
        base_mid = list_heading + rf'|{guard}(?:投标人?|磋商|谈判|供应商|应答人)\s*须知正文\s*$'
        additional_mid = rf'^第[一二三四五六七八九十百千]+?(?:{other_type})'
        mid = regex.compile(rf'(?:{base_mid})|(?:{additional_mid})', regex.MULTILINE)
        return end, mid

    def run_extraction(local_begin_pattern, local_end_pattern=None):
        """
        Run one scan over all pages.

        Args:
            local_begin_pattern (str or regex.Pattern): pattern marking the start page.
            local_end_pattern (str or regex.Pattern, optional): fixed end
                pattern; when omitted, the end and middle patterns are derived
                from the heading matched on the start page.

        Returns:
            tuple: ``(start_page, mid_page, end_page)``; missing pages are ``None``.
        """
        start_page = mid_page = end_page = None
        mid_pattern = None
        end_pattern = local_end_pattern

        for i, page in enumerate(pdf_document.pages):
            cleaned_text = clean_page_content(page.extract_text() or "", common_header)

            # Once the middle page is known, skip pages matching the exclusion pattern.
            if mid_page is not None and exclusion_pattern and regex.search(exclusion_pattern, cleaned_text):
                continue
            # Skip table-of-contents pages.
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue

            # Start page: nothing else is looked for until it has been found.
            if start_page is None:
                match = regex.search(local_begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    if local_end_pattern:
                        mid_pattern = fixed_mid_pattern
                    else:
                        end_pattern, mid_pattern = build_dynamic_patterns(match.group(0))
                continue

            # Middle page.
            if mid_page is None and regex.search(mid_pattern, cleaned_text):
                mid_page = i

            # End page: must come strictly after the middle page.
            if mid_page is not None and i > mid_page and regex.search(end_pattern, cleaned_text):
                end_page = i
                break

        return start_page, mid_page, end_page

    # First attempt: the caller's begin pattern.
    start_page, mid_page, end_page = run_extraction(begin_pattern)
    if start_page and mid_page and end_page:
        return start_page, mid_page, end_page

    # Second attempt: the chapter-heading based fallback.
    print("第二次尝试 tobidders_notice!")
    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
    if start_page and end_page and mid_page:
        return start_page, mid_page, end_page

    # Third attempt: looser begin pattern and a fixed end pattern.
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|'
        r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表',
        regex.MULTILINE
    )
    new_end_pattern = regex.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
        regex.MULTILINE
    )
    print("第三次尝试 tobidders_notice! ")
    return run_extraction(new_begin_pattern, new_end_pattern)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
    """
    Fallback locator for the tobidders_notice section based on chapter headings.

    The first part runs from a "第X章/部分 … 须知" heading to the next chapter
    heading. Unless that next heading contains one of the excluded keywords, the
    chapter after it is treated as the second part.

    Args:
        pdf_document: reader object passed to ``extract_pages_generic``; its
            ``pages`` are also indexed directly here.
        common_header: forwarded to ``clean_page_content``.
        begin_page (int): page index from which the first search starts.

    Returns:
        tuple: ``("", "", "")`` when the first part cannot be found;
        ``(start, mid, mid)`` when the chapter after the first part is excluded;
        ``(start, mid, "")`` when the second part has no end page;
        otherwise ``(start, mid, end)`` as page indices.
    """
    suffix = "tobidders_notice"
    notice_heading = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+',
        regex.MULTILINE
    )
    # Group 1 captures the Chinese title that follows the chapter number.
    chapter_heading = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE
    )
    # Chapter titles containing any of these keywords are not taken as a second part.
    skip_keywords = ("合同", "评标", "开标", "评审", "采购", "资格")
    composition_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')

    # First part.
    first_start, first_end = extract_pages_generic(
        pdf_document, notice_heading, chapter_heading, begin_page, common_header, composition_pattern, suffix
    )
    if first_start is None or first_end is None:
        return "", "", ""

    # Look at the heading on the page that closed the first part.
    boundary_text = clean_page_content(pdf_document.pages[first_end].extract_text() or "", common_header)
    heading = chapter_heading.search(boundary_text)
    if heading is not None:
        title = heading.group(1)
        for keyword in skip_keywords:
            if keyword in title:
                # Excluded chapter: report the same page as both middle and end.
                return first_start, first_end, first_end

    # Second part: the scan starts one page earlier because the generic search
    # only accepts a start page strictly after its begin_page.
    second_end = extract_pages_generic(
        pdf_document, chapter_heading, chapter_heading, first_end - 1, common_header, composition_pattern, suffix
    )[1]
    return first_start, first_end, ("" if second_end is None else second_end)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||||
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user