2025-01-03 17:36:23 +08:00
|
|
|
|
# flask_app/工程标/截取pdf工程标版.py
|
2024-12-10 17:32:08 +08:00
|
|
|
|
import regex
|
2024-10-29 09:04:23 +08:00
|
|
|
|
import os
|
2024-10-31 15:03:32 +08:00
|
|
|
|
import time
|
2024-08-29 16:37:09 +08:00
|
|
|
|
from PyPDF2 import PdfReader, PdfWriter
|
2024-12-17 18:44:58 +08:00
|
|
|
|
from flask_app.general.clean_pdf import clean_page_content
|
2024-10-29 09:04:23 +08:00
|
|
|
|
|
2025-01-03 17:36:23 +08:00
|
|
|
|
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file
|
2024-12-17 18:44:58 +08:00
|
|
|
|
from flask_app.general.通用功能函数 import get_global_logger
|
2024-11-01 14:28:10 +08:00
|
|
|
|
|
2025-01-03 17:36:23 +08:00
|
|
|
|
|
2024-11-01 14:28:10 +08:00
|
|
|
|
def _tobidders_end_pattern(chapter_type, is_secondary_match):
    """Compile the pattern that marks the first page after the bidder notice.

    Args:
        chapter_type: '章' or '部分' when the start heading used that unit,
            otherwise ``None`` (either unit is then accepted).
        is_secondary_match: ``True`` when the caller is on its second
            (looser) begin pattern. The end pattern is then compiled without
            MULTILINE and only accepts a chapter heading.

    Returns:
        A compiled ``regex`` pattern.
    """
    unit = chapter_type if chapter_type else '章|部分'
    # Reject headings that are only being referred to ("见第三章", quoted titles).
    not_referenced = r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
    # The numeral class contains neither 章 nor 部, so the lazy quantifier
    # matches exactly what a greedy one would.
    heading = rf'{not_referenced}第[一二三四五六七八九十百千]+?(?:{unit})\s*[\u4e00-\u9fff]+'
    if is_secondary_match:
        # No MULTILINE here, exactly as before: '$' anchors at the end of the page text.
        return regex.compile(heading + r'\s*$')
    alternatives = (
        '^' + heading,
        r'^评标(方法|办法)前附表',
        r'^附录(?:一)?[::]',
        r'^附件(?:一)?[::]',
        r'^附表(?:一)?[::]',
    )
    return regex.compile('|'.join(alternatives), regex.MULTILINE)


def _tobidders_mid_pattern(chapter_type):
    """Compile the pattern that marks where the notice body starts.

    The body start separates the leading data sheet from the main text.
    When the start heading used one unit ('章' or '部分'), a heading using
    the *other* unit also counts as the start of the body.
    """
    if not chapter_type:
        return regex.compile(
            r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)',
            regex.MULTILINE
        )
    base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
                       r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
                       r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)\s*须知正文\s*$'
    other_unit = '部分' if chapter_type == '章' else '章'
    additional_mid_pattern = rf'^第[一二三四五六七八九十百千]+?(?:{other_unit})'
    return regex.compile(
        rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
        regex.MULTILINE
    )


def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
                                   is_secondary_match):
    """Split the bidder-notice section of a PDF into two extracted files.

    Three page indices are located in one pass over the document:
    ``start_page`` (first page after ``begin_page`` matching
    ``begin_pattern``), ``mid_page`` (first later page matching the
    body-start pattern) and ``end_page`` (first page after ``mid_page``
    matching the end pattern).

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder handed to ``save_extracted_pages``.
        begin_pattern: Pattern (string or compiled) identifying the start page.
        begin_page: A start match only counts on pages with index > this value.
            When it is 0, pages that look like a table of contents are skipped.
        common_header: Header text passed to ``clean_page_content`` and
            ``save_extracted_pages``.
        is_secondary_match: Selects the looser variant of the end pattern.

    Returns:
        ``(path1, path2)`` as returned by ``save_extracted_pages`` for the
        ranges start..mid ("tobidders_notice_table") and mid..end
        ("tobidders_notice"), or ``("", "")`` when any boundary is missing.
    """
    pdf_document = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    def run_extraction():
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None
        end_pattern = None

        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)

            # Once the body has started, pages that merely describe how the
            # bid documents are composed must not be mistaken for the end.
            if mid_page is not None and exclusion_pattern.search(cleaned_text):
                continue
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue  # table-of-contents page

            if start_page is None:
                match = regex.search(begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    matched_text: str = match.group(0)
                    # Remember which unit the start heading used, if any.
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None
                    end_pattern = _tobidders_end_pattern(chapter_type, is_secondary_match)
                    combined_mid_pattern = _tobidders_mid_pattern(chapter_type)
                    # The start page itself is never the mid or end page.
                    continue

            if start_page is not None and mid_page is None and combined_mid_pattern is not None:
                if combined_mid_pattern.search(cleaned_text):
                    mid_page = i

            # Keyed on end_pattern rather than on the chapter unit, so the
            # default end pattern is honoured when the start heading
            # contained neither 章 nor 部分.
            if start_page is not None and mid_page is not None and end_pattern is not None:
                if i > mid_page and end_pattern.search(cleaned_text):
                    end_page = i
                    break

        return start_page, mid_page, end_page

    start_page, mid_page, end_page = run_extraction()
    if start_page is None or end_page is None or mid_page is None:
        print(f"first: tobidders_notice 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
        return "", ""

    path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_table", common_header)
    path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice", common_header)
    return path1, path2
|
2024-09-05 15:47:29 +08:00
|
|
|
|
|
2024-10-29 20:40:14 +08:00
|
|
|
|
|
2024-12-18 10:32:24 +08:00
|
|
|
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header):
    """Extract the page range delimited by a begin and an end pattern.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder handed to ``save_extracted_pages``.
        begin_pattern: Pattern identifying the start page; only pages with
            index >= ``begin_page`` are considered.
        begin_page: Lowest page index allowed as a start page.
        end_pattern: Pattern identifying the end page. A page that also
            matches ``begin_pattern`` is never accepted as the end page.
        output_suffix: Section name. For "notice" and "invalid" the first
            start match is kept; for other values a later begin match
            replaces the start page. For "invalid" the end page must also
            have index > 30.
        common_header: Header text passed to ``clean_page_content`` and
            ``save_extracted_pages``.

    Returns:
        A one-element list: the value returned by ``save_extracted_pages``,
        or ``""`` when no start or end page was found.
    """
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    # Only the first page matching the exclusion pattern is skipped.
    skip_next_excluded = True
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
    keep_first_start = output_suffix in ("notice", "invalid")

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        if skip_next_excluded and exclusion_pattern.search(cleaned_text):
            skip_next_excluded = False
            continue

        begins_here = regex.search(begin_pattern, cleaned_text)
        if begins_here and i >= begin_page:
            # `is None` rather than truthiness: page index 0 is a valid
            # start page and must not be overwritten by a later match.
            if start_page is None or not keep_first_start:
                start_page = i

        if start_page is not None and not begins_here and regex.search(end_pattern, cleaned_text):
            if i > start_page:
                # An "invalid" section is assumed to end after page index 30.
                if output_suffix != "invalid" or i > 30:
                    end_page = i
                    break

    if start_page is None or end_page is None:
        print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
        return [""]
    return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
|
2024-10-29 20:40:14 +08:00
|
|
|
|
|
2024-11-01 14:28:10 +08:00
|
|
|
|
|
|
|
|
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """Fallback extraction used after every pattern pair has failed.

    Only the "qualification" suffix is handled: the scan starts at
    ``last_begin_index`` and looks for an appendix/attachment/table heading
    on a page that mentions one of the qualification keywords, then for the
    next attachment heading on a keyword-free page or the next chapter-like
    heading.

    Returns:
        A one-element list holding the result of ``save_extracted_pages``
        on success; an empty list when nothing was found, the suffix is not
        supported, or any exception was raised.
    """
    try:
        pdf_document = PdfReader(pdf_path)
        if output_suffix != "qualification":
            print(f"{output_suffix} twice: 未定义的输出后缀。")
            return []

        print("twice:qualificaiton!")
        # Keywords that mark a page as belonging to the qualification review.
        include_keys = ("资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉")
        # Start: first appendix / attachment / attached-table heading only.
        begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
        # End, variant 1: any numbered appendix / attachment / table heading.
        end_pattern_attachment = regex.compile(
            r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$',
            regex.MULTILINE
        )
        # End, variant 2: chapter heading, evaluation data sheet, bidder notice.
        end_pattern_chapter = regex.compile(
            r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
            r'\s*评标(?:办法|方法)前附表\s*$|'
            r'投标人须知',
            regex.MULTILINE
        )

        first_page = None
        last_page = None
        remaining_pages = pdf_document.pages[last_begin_index:]
        for page_no, pdf_page in enumerate(remaining_pages, start=last_begin_index):
            raw_text = pdf_page.extract_text()
            if not raw_text:
                continue
            page_text = clean_page_content(raw_text, common_header)
            mentions_key = any(key in page_text for key in include_keys)

            if first_page is None:
                # Nothing can end before something has started.
                if mentions_key and regex.search(begin_pattern, page_text, regex.MULTILINE):
                    first_page = page_no
                continue

            if end_pattern_attachment.search(page_text):
                if mentions_key:
                    # Still a qualification attachment: keep scanning.
                    print(f"当前页面匹配附件结束模式但包含 include_keys,继续查找。页面: {page_no}")
                elif page_no > first_page:
                    last_page = page_no
                    break
            elif end_pattern_chapter.search(page_text):
                if page_no > first_page:
                    last_page = page_no
                    break

        if first_page is not None and last_page is not None:
            return [
                save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)]
        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return []
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return []
|
2024-10-29 20:40:14 +08:00
|
|
|
|
|
|
|
|
|
|
2025-01-03 17:36:23 +08:00
|
|
|
|
def _engineering_pattern_pairs(selection):
    """Return ``(pattern_pairs, output_suffix)`` for selections 1-4.

    Each pair is ``(begin_pattern, end_pattern)``; pairs are tried in order,
    the second one being the looser fallback. Returns ``None`` for any other
    selection.
    """
    # Reject headings that are only being referred to ("见…", "与…", quoted).
    # Kept as one shared string so the quote variants cannot drift apart
    # between patterns.
    not_referenced = r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'

    if selection == 1:
        # Selection 1: tender announcement / invitation.
        pattern_pairs = [
            (
                # NOTE(review): compiled without MULTILINE, so '^' only
                # anchors at the very start of the page text - confirm that
                # this is intentional.
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'.*' + not_referenced
                    + r'(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'.*' + not_referenced
                    + r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商|应答人)须知(?:前附表)?\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "notice"

    if selection == 2:
        # Selection 2: bid evaluation method.
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                    r'(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                    r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[::]清标报告\s*$|'
                    + not_referenced + r'评标(方法|办法)正文\s*$|'
                    + not_referenced + r'(?:评标|定标)详细程序\s*$',
                    regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    # First form: a chapter heading mentioning evaluation.
                    r'(?:' + not_referenced + r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'
                    + r'(?:[\u4e00-\u9fff、()()]*?)'
                    + r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
                    + r'[\u4e00-\u9fff、()()]*\s*$|'
                    # Second form: the evaluation data sheet title on its own line.
                    + r'^\s*' + not_referenced + r'评标(方法|办法)前附表\s*$)',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "evaluation_method"

    if selection == 3:
        # Selection 3: qualification review conditions.
        pattern_pairs = [
            (
                regex.compile(
                    r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|'
                    + not_referenced + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                    + r'[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|'
                    + not_referenced + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "qualification"

    if selection == 4:
        # Selection 4: instructions to bidders (data sheet + body).
        bidder = r'(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)'
        section_end = (
            r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|'
            r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]'
        )
        pattern_pairs = [
            (
                regex.compile(
                    r'(?:第[一二三四五六七八九十百千]+(?:章|部分).*' + bidder + r'须知|'
                    + bidder + r'须知前附表)\s*$',
                    regex.MULTILINE
                ),
                regex.compile(section_end, regex.MULTILINE)
            ),
            (
                regex.compile(
                    r'.*' + not_referenced + bidder + r'须知\s*$|'
                    + not_referenced + bidder + r'须知前附表\s*$',
                    regex.MULTILINE
                ),
                regex.compile(section_end, regex.MULTILINE)
            )
        ]
        return pattern_pairs, "tobidders_notice"

    return None


def _has_extracted_file(paths):
    """Return True when ``paths`` contains at least one existing file."""
    return bool(paths) and any(os.path.isfile(p) for p in paths)


def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default"):
    """Extract one section of an engineering tender PDF (or a folder of PDFs).

    Args:
        input_path: A ``.pdf`` file or a directory whose ``.pdf`` files are
            all processed.
        output_folder: Folder passed on to the extraction helpers.
        selection: 1 = tender announcement, 2 = evaluation method,
            3 = qualification review, 4 = instructions to bidders
            (data sheet + body, two outputs), 5 = invalid-bid file.
        logger: Logger used for error reporting.
        output_suffix: Accepted for interface compatibility; the suffix
            actually used is derived from ``selection``.

    Returns:
        A list of output paths. Failed extractions are represented by empty
        strings (two of them for selection 4, otherwise one); a directory
        input yields the concatenation of the per-file lists.
    """
    try:
        def process_single_file(file_path, output_folder, selection):
            """Run the selected extraction on a single PDF."""
            try:
                # Header repeated on the pages, plus the index the section
                # search should start from.
                common_header, last_begin_index = get_start_and_common_header(file_path, 20)

                if selection == 5:
                    invalid_path = get_invalid_file(file_path, output_folder, common_header, last_begin_index)
                    return [invalid_path] if invalid_path else [""]

                config = _engineering_pattern_pairs(selection)
                if config is None:
                    print("无效的选择:请选择1-5")
                    return [""]
                pattern_pairs, section_suffix = config

                # Default start offsets used when no start index was detected.
                # Does not depend on the pattern pair, so it is computed once.
                default_begin_pages = {1: 0, 2: 5, 3: 5, 4: 0}
                if last_begin_index != 0:
                    begin_page = last_begin_index
                else:
                    begin_page = default_begin_pages.get(selection, 0)

                for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
                    is_secondary_match = (idx == 2)
                    if selection == 4:
                        # The bidder notice derives its own mid/end patterns.
                        output_paths = list(
                            extract_pages_tobidders_notice(
                                file_path,
                                output_folder,
                                begin_pattern,
                                begin_page,
                                common_header,
                                is_secondary_match
                            )
                        )
                    else:
                        output_paths = extract_pages(
                            file_path,
                            output_folder,
                            begin_pattern,
                            begin_page=begin_page,
                            end_pattern=end_pattern,
                            output_suffix=section_suffix,
                            common_header=common_header
                        )
                    if _has_extracted_file(output_paths):
                        return output_paths  # first successful pair wins

                # Every pattern pair failed: try the fallback strategy.
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。")
                fallback_result = extract_pages_twice(
                    file_path,
                    output_folder,
                    section_suffix,
                    common_header,
                    last_begin_index
                )
                if _has_extracted_file(fallback_result):
                    return fallback_result

                logger.error(f"Selection {selection}: 回退提取失败,返回空字符串。")
                return ["", ""] if selection == 4 else [""]

            except Exception as e:
                logger.error(f"Error in processing file '{file_path}': {e}")
                return [""] * (2 if selection == 4 else 1)

        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    file_path = os.path.join(input_path, file_name)
                    files = process_single_file(file_path, output_folder, selection)
                    if files:
                        generated_files.extend(files)
            return generated_files
        elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return [""] * (2 if selection == 4 else 1)

    except Exception as e:
        # Message now names this function (it previously named the goods variant).
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return [""] * (2 if selection == 4 else 1)
|
2024-10-29 09:04:23 +08:00
|
|
|
|
|
2025-01-03 17:36:23 +08:00
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run against a local sample file; paths are machine-specific.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative inputs kept for local experiments:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf"
    output_folder = r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp"

    # 1 = tender announcement, 2 = evaluation method, 3 = qualification review,
    # 4 = bidder-notice data sheet + body, 5 = invalid-bid file
    selection = 4

    generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
    print(generated_files)

    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")
|