12.9 截取pdf逻辑优化

This commit is contained in:
zy123 2024-12-10 17:32:08 +08:00
parent 329f5680ec
commit d83583dc41
5 changed files with 250 additions and 305 deletions

View File

@ -12,10 +12,9 @@ def extract_text_by_page(file_path):
page = reader.pages[page_num] page = reader.pages[page_num]
text = page.extract_text() text = page.extract_text()
if text: if text:
print(text) # print(text)
cleaned_text = clean_page_content(text,common_header) cleaned_text = clean_page_content(text,common_header)
# cleaned_text=text print(cleaned_text)
# print(cleaned_text)
print("-----------------"+str(page_num)) print("-----------------"+str(page_num))
result += cleaned_text result += cleaned_text
# print(f"Page {page_num + 1} Content:\n{cleaned_text}") # print(f"Page {page_num + 1} Content:\n{cleaned_text}")
@ -119,8 +118,8 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__': if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf" file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件.pdf"
file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf' # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"

View File

@ -1,55 +1,26 @@
import re import regex
line_stripped="""1.采购人:陕西省某单位
2采购代理机构陕西坤硕项目管理有限公司
3供应商响应招标并且符合招标文件规定资格条件和参加投标竞
争的法人其他组织或者自然人
"""
pure_number_match = re.match(r'^(\d+)([^.\d)()、].*)', line_stripped) # 不允许出现右括号
if pure_number_match:
print("yes")
# 测试字符串 begin_pattern = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
regex.MULTILINE
)
# 测试示例
test_strings = [ test_strings = [
""" '投标人须知正文', # 匹配
.4评标委员会成员因缺席回避擅评标办法前附表康等原因不能继续履评标办法前附表 '”投标人须知正文', # 不匹配
责的采购人或者采购代理机构有权向相关监督管理部门通报 '” 投标人须知正文', # 不匹配
17. 投标人资格审查和投标文件符合性审查 '与 投标人须知正文', # 不匹配
17.1投标人资格审查指依据法律法规和招标文件的规定对投标文件中的资格资信证 '见 投标人须知正文', # 不匹配
明等进行审查以确定投标人是否具备投标资格投标文件符合性审查指依据招标文件的 '“ 投标人须知正文', # 不匹配
规定从投标文件的有效性完整性和对招标文件的响应程度进行审查以确定是否对招 '供应商须知正文', # 匹配
标文件的实质性要求作出响应 '谈判供应商须知正文' # 匹配
17.2投标人未通过资格审查的不得进入投标文件符合性审查 投标人未通过符合性审查的
不得进入投标文件的综合比较与评价
17.3品牌及型号必须为清单中有效期内产品并提供证明文件 否则其投标将作为无效投标
被拒绝
17.3.1如本项目使用最低评标价法 提供相同品牌产品的不同投标人以其中通过资格审查
符合性审查且报价最低的参加评标报价相同的由采购人或者采购人委托评标委员会按
照招标文件中评标办法规定的方式确定 一个参加评标的投标人其他投标无效
17.3.2如本项目使用综合评分法提供相同品牌产品且通过资格审查符合性审查的不同
投标人按一家投标人计算评审后得分最高的同品牌投标人获得中标人推荐资格评审
得分相同的由采购人或者采购人委托评标 委员会按照招标文件中评标办法规定的方式确
定一个投标人获得中标人推荐资格
17.4如一个分包内包含多种产品的 采购人或采购代理机构将在投标人须知前附表中载明
核心产品多家投标人提供的所有核心产品品牌均相同的 按第 18.3.2 条及相关法律法
规处理
17.5投标人所投产品如被列入财政部与国家主管部门颁发的节能产品目录或环境标志产
品目录应提供相关证明在评标时予以优先采购具体优先采购办见第五章评标方法
和标准如采购人所采购产品为政府强制采购的节能产品投标人所投产品的品牌及型号
必须为清单中有效期内产品并提供证明文件否则其投标将作为无效投标被拒绝
17.6投标人不良信用记录以采购人或采购代理机构查询结果为准
17.7资格审查和符合性审查标准详见第五章评标方法和标准
18. 投标文件的澄清和修正
18.1对于投标文件中含义不明确 同类问题表述不一致或者有明显文字和计算错误的内容
评标委员会应当以书面形式要求投标人作出必要的澄 说明或者补正
18.2投标人的澄清说明或者补正应当采用书面形式并加盖公章或者由法定代表人或
其授权的代表签字投标人的澄清说明或者补正不得超出投标文件的范围或者改变投标
文件的实质性内容澄清文件将作为投标文件内容的一部分
"""
] ]
# for test_string in test_strings: for s in test_strings:
# match = re.search(begin_pattern, test_string) if begin_pattern.search(s):
# if match: print(f"匹配: {s}")
# print("Matched Content:", match.group()) # 输出匹配的内容 else:
# else: print(f"不匹配: {s}")
# print("No match found.")

View File

@ -1,4 +1,4 @@
import re import regex
import os import os
import time import time
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
@ -41,7 +41,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
for page_num in range(min(start_page, total_pages)): for page_num in range(min(start_page, total_pages)):
page_text = pdf_document.pages[page_num].extract_text() page_text = pdf_document.pages[page_num].extract_text()
cleaned_text = clean_page_content(page_text, common_header) cleaned_text = clean_page_content(page_text, common_header)
if re.search(r'\s*录', cleaned_text, re.MULTILINE): if regex.search(r'\s*录', cleaned_text, regex.MULTILINE):
toc_page = page_num toc_page = page_num
break break
@ -75,7 +75,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
is_secondary_match): is_secondary_match):
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
def run_extraction(): def run_extraction():
start_page = None start_page = None
@ -87,11 +87,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
text = page.extract_text() or "" text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
continue continue
if start_page is None: if start_page is None:
match = re.search(begin_pattern, cleaned_text) match = regex.search(begin_pattern, cleaned_text)
if match and i > begin_page: if match and i > begin_page:
start_page = i start_page = i
matched_text = match.group(0) # 获取整个匹配的文本 matched_text = match.group(0) # 获取整个匹配的文本
@ -105,16 +105,16 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
if chapter_type: if chapter_type:
# 根据 chapter_type 动态生成 end_pattern # 根据 chapter_type 动态生成 end_pattern
if not is_secondary_match: if not is_secondary_match:
end_pattern = re.compile( end_pattern = regex.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|' r'^评标(方法|办法)前附表|'
r'^附录(?:一)?[:]|' r'^附录(?:一)?[:]|'
r'^附件(?:一)?[:]|' r'^附件(?:一)?[:]|'
r'^附表(?:一)?[:]', r'^附表(?:一)?[:]',
re.MULTILINE regex.MULTILINE
) )
else: else:
end_pattern = re.compile( end_pattern = regex.compile(
rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$' rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$'
) )
# 根据 chapter_type 动态生成 additional_mid_pattern # 根据 chapter_type 动态生成 additional_mid_pattern
@ -126,51 +126,50 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
additional_mid_pattern = '' additional_mid_pattern = ''
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = ( base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)' r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
)
# 合并基础模式和额外模式 # 合并基础模式和额外模式
if additional_mid_pattern: if additional_mid_pattern:
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})', rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
re.MULTILINE regex.MULTILINE
) )
else: else:
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}', rf'{base_mid_pattern}',
re.MULTILINE regex.MULTILINE
) )
# print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern # print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern
else: else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern # 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
if not is_secondary_match: if not is_secondary_match:
end_pattern = re.compile( end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|' r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line r'^评标(方法|办法)前附表|' # Match "评标办法前附表" at the beginning of a line
r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon
r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon
r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon
re.MULTILINE regex.MULTILINE
) )
else: else:
end_pattern = re.compile( end_pattern = regex.compile(
rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$' rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$'
) )
# print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern # print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}', rf'{base_mid_pattern}',
re.MULTILINE regex.MULTILINE
) )
continue continue
if start_page is not None and mid_page is None and combined_mid_pattern: if start_page is not None and mid_page is None and combined_mid_pattern:
if re.search(combined_mid_pattern, cleaned_text): if regex.search(combined_mid_pattern, cleaned_text):
mid_page = i mid_page = i
if start_page is not None and mid_page is not None and chapter_type: if start_page is not None and mid_page is not None and chapter_type:
if re.search(end_pattern, cleaned_text): if regex.search(end_pattern, cleaned_text):
if i > mid_page: if i > mid_page:
end_page = i end_page = i
break break
@ -195,22 +194,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
start_page = None start_page = None
end_page = None end_page = None
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
# 遍历文档的每一页,查找开始和结束短语的位置 # 遍历文档的每一页,查找开始和结束短语的位置
for i in range(len(pdf_document.pages)): for i in range(len(pdf_document.pages)):
page = pdf_document.pages[i] page = pdf_document.pages[i]
text = page.extract_text() text = page.extract_text()
if text: if text:
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
# if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" # if is_secondary_match and regex.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
if re.search(exclusion_pattern, cleaned_text): if regex.search(exclusion_pattern, cleaned_text):
continue continue
if re.search(begin_pattern, cleaned_text) and i >= begin_page: if regex.search(begin_pattern, cleaned_text) and i >= begin_page:
if start_page and (output_suffix == "notice" or output_suffix == "invalid"): if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
pass pass
else: else:
start_page = i start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text): if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,cleaned_text):
condition = i > start_page condition = i > start_page
if condition: if condition:
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页 is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
@ -240,16 +239,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])' begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
end_pattern_attachment = re.compile( end_pattern_attachment = regex.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$', r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
re.MULTILINE regex.MULTILINE
) )
# 定义结束匹配模式 - 章节标题、评标附表、投标人须知等 # 定义结束匹配模式 - 章节标题、评标附表、投标人须知等
end_pattern_chapter = re.compile( end_pattern_chapter = regex.compile(
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|' r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
r'\s*评标(?:办法|方法)前附表\s*$|' r'\s*评标(?:办法|方法)前附表\s*$|'
r'投标人须知', r'投标人须知',
re.MULTILINE regex.MULTILINE
) )
start_page = None start_page = None
end_page = None end_page = None
@ -260,7 +259,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
# 确定起始页需在last_begin_index之后 # 确定起始页需在last_begin_index之后
if any(key in cleaned_text for key in include_keys): if any(key in cleaned_text for key in include_keys):
if re.search(begin_pattern, cleaned_text, re.MULTILINE): if regex.search(begin_pattern, cleaned_text, regex.MULTILINE):
if start_page is None: if start_page is None:
start_page = i # 确保起始页不小于章节的开始页码 start_page = i # 确保起始页不小于章节的开始页码
continue continue
@ -305,23 +304,28 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
def get_start_and_common_header(input_path): def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path) common_header = extract_common_header(input_path)
last_begin_index = 0 last_begin_index = 0
begin_pattern = re.compile( begin_pattern = regex.compile(
r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
re.MULTILINE regex.MULTILINE
) )
# 新增目录匹配模式
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
pdf_document = PdfReader(input_path) pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages): for i, page in enumerate(pdf_document.pages):
if i > 25: if i > 10:
return common_header, 0 # 如果页码大于25直接返回last_begin_index=0 return common_header, 0 # 如果页码大于10直接返回last_begin_index=0
text = page.extract_text() text = page.extract_text()
if text: if text:
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
# 检查是否存在"目录"
if catalog_pattern.search(cleaned_text):
continue # 如果存在目录,跳过当前页面
if begin_pattern.search(cleaned_text): if begin_pattern.search(cleaned_text):
last_begin_index = i # 更新第一个匹配的索引页码从0开始 last_begin_index = i # 更新第一个匹配的索引页码从0开始
return common_header, last_begin_index return common_header, last_begin_index
return common_header, last_begin_index return common_header, last_begin_index
def truncate_pdf_main(input_path, output_folder, selection): def truncate_pdf_main(input_path, output_folder, selection):
if os.path.isdir(input_path): if os.path.isdir(input_path):
generated_files = [] generated_files = []
@ -340,19 +344,21 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 1: 投标人须知前附表 # Selection 1: 投标人须知前附表
pattern_pairs = [ pattern_pairs = [
( (
re.compile( regex.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'), r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
re.compile( regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]', r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE) regex.MULTILINE)
), ),
( (
re.compile( regex.compile(
r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
re.MULTILINE), r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
re.compile( regex.MULTILINE
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]', ),
re.MULTILINE) regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
) )
] ]
output_suffix = "tobidders_notice" output_suffix = "tobidders_notice"
@ -360,23 +366,23 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 2: 评标办法 # Selection 2: 评标办法
pattern_pairs = [ pattern_pairs = [
( (
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'), regex.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))',regex.MULTILINE),
# Alternative begin pattern # Alternative begin pattern
re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+') regex.compile(r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[:]清标报告\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',regex.MULTILINE)
# Alternative end pattern # Alternative end pattern
), ),
( (
re.compile( regex.compile(
r'(?<!见)' # 确保前面不是“见” r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 确保前面不是“见”
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符(中文、顿号、括号) r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符(中文、顿号、括号)
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法” r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$' # 继续匹配允许的字符直到行尾 r'[\u4e00-\u9fff、()]*\s*$' # 继续匹配允许的字符直到行尾
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表” r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
re.MULTILINE regex.MULTILINE
), ),
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) regex.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
) )
] ]
output_suffix = "evaluation_method" output_suffix = "evaluation_method"
@ -385,19 +391,19 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 3: 资格审查条件 # Selection 3: 资格审查条件
pattern_pairs = [ pattern_pairs = [
# ( # (
# re.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*', # regex.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*',
# re.MULTILINE), # regex.MULTILINE),
# re.compile( # regex.compile(
# r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$' # r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$'
# r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) # r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
# ), # ),
( (
re.compile( regex.compile(
r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$', r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
re.MULTILINE), regex.MULTILINE),
re.compile( regex.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|' r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|'
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
) )
] ]
output_suffix = "qualification" output_suffix = "qualification"
@ -405,13 +411,16 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 4: 招标公告 # Selection 4: 招标公告
pattern_pairs = [ pattern_pairs = [
( (
re.compile( regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
), ),
( (
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE), regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', regex.MULTILINE),
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE) regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
regex.MULTILINE
)
) )
] ]
output_suffix = "notice" output_suffix = "notice"
@ -420,13 +429,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 5: 无效标 # Selection 5: 无效标
pattern_pairs = [ pattern_pairs = [
( (
re.compile( regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE) regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', regex.MULTILINE)
), ),
( (
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE), regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', regex.MULTILINE),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE) regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', regex.MULTILINE)
) )
] ]
output_suffix = "invalid" output_suffix = "invalid"
@ -591,13 +600,13 @@ if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标" # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf" input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all" output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
# files=truncate_pdf_multiple(input_path,output_folder) # files=truncate_pdf_multiple(input_path,output_folder)
# print(files) # print(files)
# selections = [4, 1] # 仅处理 selection 4、1 # selections = [4, 1] # 仅处理 selection 4、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 selection = 4 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection) generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files) print(generated_files)
# print("生成的文件:", generated_files) # print("生成的文件:", generated_files)

View File

@ -1,7 +1,8 @@
import glob
import logging import logging
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库 import regex # 导入正则表达式库
import os # 用于文件和文件夹操作 import os # 用于文件和文件夹操作
from flask_app.general.clean_pdf import clean_page_content, extract_common_header from flask_app.general.clean_pdf import clean_page_content, extract_common_header
@ -16,41 +17,6 @@ def get_global_logger(unique_id):
logger = logging.getLogger(unique_id) logger = logging.getLogger(unique_id)
return logger return logger
# fitz库版本
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
def is_pdf_or_doc(filename): def is_pdf_or_doc(filename):
# 判断文件是否为PDF或Word文档 # 判断文件是否为PDF或Word文档
return filename.lower().endswith(('.pdf', '.doc', '.docx')) return filename.lower().endswith(('.pdf', '.doc', '.docx'))
@ -77,31 +43,33 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
return [result or ""] return [result or ""]
return [""] # 返回空字符串 return [""] # 返回空字符串
# 默认逻辑是start_page匹配上就不再设置了一般不匹配上目录的原因是设置了begin_page=5但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"): output_suffix="normal"):
start_page = None start_page = None
end_page = None end_page = None
flag=True
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
text = page.extract_text() or "" text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
if output_suffix == "tobidders_notice": if output_suffix == "tobidders_notice":
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and start_page is not None: if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
flag=False
continue continue
else: else:
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text): if exclusion_pattern and flag and regex.search(exclusion_pattern, cleaned_text):
flag=False
continue continue
if start_page is None and re.search(begin_pattern, cleaned_text): if start_page is None and regex.search(begin_pattern, cleaned_text):
if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
start_page = i start_page = i
continue
if start_page is not None: if start_page is not None:
if output_suffix == "tobidders_notice": if output_suffix == "tobidders_notice":
if re.search(end_pattern, cleaned_text) and i > start_page: if regex.search(end_pattern, cleaned_text) and i > start_page:
end_page = i end_page = i
break break
else: else:
if re.search(end_pattern, cleaned_text) and i > start_page and not re.search(begin_pattern,cleaned_text): if regex.search(end_pattern, cleaned_text) and i > start_page and not regex.search(begin_pattern,cleaned_text):
end_page = i end_page = i
break break
return start_page, end_page return start_page, end_page
@ -115,8 +83,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
total_pages = len(pdf_document.pages) - 1 # 获取总页数 total_pages = len(pdf_document.pages) - 1 # 获取总页数
if output_suffix == "tobidders_notice": if output_suffix == "tobidders_notice":
exclusion_pattern = re.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') r'文件的构成|文件的组成|文件构成|文件组成')
start_page, mid_page, end_page = extract_pages_tobidders_notice( start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
) )
@ -138,8 +106,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
else: else:
# 原有的处理逻辑保持不变 # 原有的处理逻辑保持不变
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method": if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
exclusion_pattern = re.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') r'文件的构成|文件的组成|文件构成|文件组成')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
common_header, exclusion_pattern, output_suffix) common_header, exclusion_pattern, output_suffix)
# 针对 selection = 6 的特殊处理 # 针对 selection = 6 的特殊处理
@ -164,62 +132,45 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
def get_patterns_for_procurement(): def get_patterns_for_procurement():
# begin_pattern = re.compile( # begin_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*', # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
# re.MULTILINE) # regex.MULTILINE)
begin_pattern = re.compile( begin_pattern = regex.compile(
r'(?<!见)' r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符 r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符
r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求” r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
r'(?:采购|需求)[\u4e00-\u9fff、()]*?)\s*$', # 或者匹配“采购”或“需求” r'(?:采购|需求)[\u4e00-\u9fff、()]*?)\s*$', # 或者匹配“采购”或“需求”
re.MULTILINE regex.MULTILINE
) )
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
return begin_pattern, end_pattern return begin_pattern, end_pattern
def get_patterns_for_evaluation_method(): def get_patterns_for_evaluation_method():
# begin_pattern = re.compile( begin_pattern = regex.compile(
# r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、()]*\s*$', r'(?:(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 第一种模式
# re.MULTILINE r'(?:[\u4e00-\u9fff、()]*?)'
# ) r'(?=.*(?:磋商|谈判|评标|评定|评审))'
begin_pattern = re.compile( r'(?=.*(?:办法|方法|内容))'
r'(?<!见)' r'[\u4e00-\u9fff、()]*\s*$|'
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” r'^\s*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)', # 第二种模式
r'(?:[\u4e00-\u9fff、()]*?)' # 匹配允许的字符(中文、顿号、括号) regex.MULTILINE
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。
r'(?=.*(?:办法|方法|内容))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$', # 继续匹配允许的字符直到行尾
re.MULTILINE
) )
end_pattern = re.compile( end_pattern = regex.compile(
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
return begin_pattern, end_pattern return begin_pattern, end_pattern
def get_patterns_for_notice():
    """Build the begin/end patterns that delimit the tender-notice chapter.

    begin_pattern matches a chapter/part heading containing 公告, 邀请书 or
    邀请函. It is compiled without MULTILINE, so ``^`` only anchors at the
    very start of the searched text.
    end_pattern matches any line that starts with a chapter/part heading
    followed by at least one CJK character.

    Returns:
        tuple: (begin_pattern, end_pattern).
    """
    begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = regex.compile(
        # Narrower alternative (bidder-instruction headings only), kept for reference:
        # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
def get_patterns_for_notice_twice():
    """Build the multi-line begin/end patterns for the tender-notice chapter.

    Unlike get_patterns_for_notice, begin_pattern here is compiled with
    re.MULTILINE, so the heading may start on any line of the page text.
    The keyword list (公告, 邀请书, 邀请函) is kept identical to
    get_patterns_for_notice so both variants accept the same headings.

    Returns:
        tuple: (begin_pattern, end_pattern), both compiled with re.MULTILINE.
    """
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*',
        re.MULTILINE
    )
    end_pattern = re.compile(
        # Narrower alternative (bidder-instruction headings only), kept for reference:
        # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE
    )
    return begin_pattern, end_pattern
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header, # def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
# exclusion_pattern): # exclusion_pattern):
# start_page = None # start_page = None
@ -228,14 +179,14 @@ def get_patterns_for_notice_twice():
# for i, page in enumerate(pdf_document.pages): # for i, page in enumerate(pdf_document.pages):
# text = page.extract_text() or "" # text = page.extract_text() or ""
# cleaned_text = clean_page_content(text, common_header) # cleaned_text = clean_page_content(text, common_header)
# if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: # if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
# continue # continue
# if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: # if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
# start_page = i # start_page = i
# if start_page is not None and mid_page is None and re.search( # if start_page is not None and mid_page is None and regex.search(
# r'^\s*[(]?\s*[一1]\s*[)]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text): # r'^\s*[(]?\s*[一1]\s*[)]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
# mid_page = i # mid_page = i
# if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page: # if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
# end_page = i # end_page = i
# break # break
# return start_page, mid_page, end_page # return start_page, mid_page, end_page
@ -248,10 +199,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
参数: 参数:
pdf_document (PDFDocument): 要处理的PDF文档对象 pdf_document (PDFDocument): 要处理的PDF文档对象
begin_pattern (str re.Pattern): 用于识别起始的正则表达式模式 begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
begin_page (int): 开始搜索的页码 begin_page (int): 开始搜索的页码
common_header (str): 每页需要清理的公共头部文本 common_header (str): 每页需要清理的公共头部文本
exclusion_pattern (str re.Pattern): 用于排除某些页的模式 exclusion_pattern (str regex.Pattern): 用于排除某些页的模式
返回: 返回:
tuple: (start_page, mid_page, end_page) 如果成功否则 (None, None, None) tuple: (start_page, mid_page, end_page) 如果成功否则 (None, None, None)
@ -264,8 +215,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
如果未提供 local_end_pattern则根据匹配的章节类型动态生成 end_pattern 如果未提供 local_end_pattern则根据匹配的章节类型动态生成 end_pattern
参数: 参数:
local_begin_pattern (str re.Pattern): 用于识别起始的正则表达式模式 local_begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
local_end_pattern (str re.Pattern, 可选): 用于识别结束的正则表达式模式 local_end_pattern (str regex.Pattern, 可选): 用于识别结束的正则表达式模式
返回: 返回:
tuple: (start_page, mid_page, end_page) tuple: (start_page, mid_page, end_page)
@ -281,12 +232,12 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
# 如果已经找到中间页,且当前页匹配排除模式,则跳过 # 如果已经找到中间页,且当前页匹配排除模式,则跳过
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
continue continue
# 识别起始页 # 识别起始页
if start_page is None: if start_page is None:
match = re.search(local_begin_pattern, cleaned_text) match = regex.search(local_begin_pattern, cleaned_text)
if match and i > begin_page: if match and i > begin_page:
start_page = i start_page = i
matched_text = match.group(0) # 获取整个匹配的文本 matched_text = match.group(0) # 获取整个匹配的文本
@ -302,9 +253,9 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
if chapter_type: if chapter_type:
# 根据 chapter_type 动态生成 end_pattern # 根据 chapter_type 动态生成 end_pattern
end_pattern_dynamic = re.compile( end_pattern_dynamic = regex.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+', rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
re.MULTILINE regex.MULTILINE
) )
# 根据 chapter_type 动态生成 additional_mid_pattern # 根据 chapter_type 动态生成 additional_mid_pattern
@ -317,54 +268,55 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \ base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
# 合并基础模式和额外模式 # 合并基础模式和额外模式
if additional_mid_pattern: if additional_mid_pattern:
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})', rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
re.MULTILINE regex.MULTILINE
) )
else: else:
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}', rf'{base_mid_pattern}',
re.MULTILINE regex.MULTILINE
) )
else: else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern # 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
end_pattern_dynamic = re.compile( end_pattern_dynamic = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
re.MULTILINE regex.MULTILINE
) )
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \ base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}', rf'{base_mid_pattern}',
re.MULTILINE regex.MULTILINE
) )
else: else:
# 如果提供了固定的 end_pattern则使用默认的 mid_pattern # 如果提供了固定的 end_pattern则使用默认的 mid_pattern
base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明" base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
combined_mid_pattern = re.compile( combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}', rf'{base_mid_pattern}',
re.MULTILINE regex.MULTILINE
) )
continue continue
# 识别中间页 # 识别中间页
if start_page is not None and mid_page is None and combined_mid_pattern: if start_page is not None and mid_page is None and combined_mid_pattern:
if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text): if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
continue continue
if re.search(combined_mid_pattern, cleaned_text): if regex.search(combined_mid_pattern, cleaned_text):
mid_page = i mid_page = i
# 识别结束页 # 识别结束页
if start_page is not None and mid_page is not None: if start_page is not None and mid_page is not None:
# 使用提供的 end_pattern 或动态生成的 end_pattern # 使用提供的 end_pattern 或动态生成的 end_pattern
current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
if re.search(current_end_pattern, cleaned_text): if regex.search(current_end_pattern, cleaned_text):
if i > mid_page: if i > mid_page:
end_page = i end_page = i
break break
@ -376,21 +328,21 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
if not (start_page and mid_page and end_page): if not (start_page and mid_page and end_page):
print(f"第二次尝试 tobidders_notice!{pdf_path}") print(f"第二次尝试 tobidders_notice!")
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
if start_page and end_page and mid_page: if start_page and end_page and mid_page:
return start_page, mid_page, end_page return start_page, mid_page, end_page
else: else:
# 定义新的 begin_pattern 和 end_pattern # 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile( new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|' r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE regex.MULTILINE
) )
new_end_pattern = re.compile( new_end_pattern = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE regex.MULTILINE
) )
print("第三次尝试 tobidders_notice! ") print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式 # 第二次提取尝试,使用新的模式
@ -401,19 +353,19 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
output_suffix = "tobidders_notice" output_suffix = "tobidders_notice"
begin_pattern = re.compile( begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+', r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
re.MULTILINE regex.MULTILINE
) )
end_pattern = re.compile( end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE # 捕获中文部分 r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
) )
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词 exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成')
# 提取第一部分 # 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
if start_page1 is None or end_page1 is None: if start_page1 is None or end_page1 is None:
return "", "", "" return "", "", ""
@ -438,36 +390,36 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
exclusion_pattern, output_suffix) exclusion_pattern, output_suffix)
if end_page2 is None: if end_page2 is None:
return start_page1, end_page1, end_page1 return start_page1, end_page1, ""
return start_page1, end_page1, end_page2 return start_page1, end_page1, end_page2
def extract_pages_qualification(pdf_document, begin_page, common_header): def extract_pages_qualification(pdf_document, begin_page, common_header):
# 开始匹配模式,仅匹配“附录”、“附件”或“附表” # 开始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = re.compile( begin_pattern = regex.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)', r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)',
re.MULTILINE regex.MULTILINE
) )
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头 # 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
priority_pattern = re.compile( priority_pattern = regex.compile(
r'^(资格性检查|资格审查|符合性审查)', r'^(资格性检查|资格审查|符合性审查)',
re.MULTILINE regex.MULTILINE
) )
# 结束匹配模式 - 附录、附件、附表等 # 结束匹配模式 - 附录、附件、附表等
end_pattern_attachment = re.compile( end_pattern_attachment = regex.compile(
r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$', r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$',
re.MULTILINE regex.MULTILINE
) )
# 结束匹配模式 - 章节标题 # 结束匹配模式 - 章节标题
end_pattern_chapter = re.compile( end_pattern_chapter = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE regex.MULTILINE
) )
print("第二次尝试:匹配附件") print("第二次尝试 qualification:匹配附件")
start_page = None start_page = None
end_page = None end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"] include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
@ -519,7 +471,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page): def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
try: try:
exclusion_pattern = re.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
patterns = None patterns = None
@ -531,7 +483,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3": elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()] patterns = [get_patterns_for_evaluation_method()]
elif output_suffix == "notice": elif output_suffix == "notice":
patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()] patterns = [get_patterns_for_notice()]
elif output_suffix == "qualification1": elif output_suffix == "qualification1":
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header) start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
if patterns: if patterns:
@ -545,11 +497,17 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
if output_suffix == "qualification1": if output_suffix == "qualification1":
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") # print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
print("第三次尝试资格审查:尝试提取评分办法章节...") print("第三次尝试资格审查:尝试提取评分办法章节...")
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2") base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
if len(temp) > 0: evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")
return temp[0] if os.path.isfile(evaluation_method_file):
print(f"找到评分办法章节文件: {evaluation_method_file},直接返回。")
return evaluation_method_file
else: else:
return "" temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
else:
return ""
else: else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return "" return ""
@ -593,20 +551,27 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
print(f"Error in save_extracted_pages: {e}") print(f"Error in save_extracted_pages: {e}")
return "" # 返回空字符串 return "" # 返回空字符串
def get_start_and_common_header(input_path):
    """Find the common page header and the page where the tender notice starts.

    Scans pages 0-10 of the PDF. A page qualifies when, after header
    cleaning, one of its lines ends with a notice/invitation keyword
    (招标公告, 磋商公告, 谈判公告, 邀请书, 邀请函, 投标邀请, 磋商邀请, 谈判邀请)
    that is not directly preceded by 见, 与 or a quotation mark. Pages that
    contain a line ending in "目录" (table of contents) are skipped.

    Args:
        input_path: Path of the PDF file to inspect.

    Returns:
        tuple: (common_header, page_index). page_index is the 0-based index
        of the first qualifying page, or 0 when none is found within the
        scanned pages.
    """
    common_header = extract_common_header(input_path)
    # Variable-length lookbehinds require the third-party `regex` module.
    begin_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # A line that ends with "目录" marks a table-of-contents page.
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
            # Nothing found in the first 11 pages: fall back to page 0.
            return common_header, 0
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        if catalog_pattern.search(cleaned_text):
            # The table of contents lists the notice title too; ignore it.
            continue
        if begin_pattern.search(cleaned_text):
            return common_header, i  # first matching page, 0-based
    return common_header, 0
@ -658,41 +623,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
# 根据选择设置对应的模式和结束模式 # 根据选择设置对应的模式和结束模式
if selection == 1: if selection == 1:
begin_pattern = re.compile( begin_pattern = regex.compile(
r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE) r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) regex.MULTILINE
)
end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$', regex.MULTILINE)
local_output_suffix = "notice" local_output_suffix = "notice"
elif selection == 2: elif selection == 2:
begin_pattern = re.compile( begin_pattern = regex.compile(
r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))') r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "evaluation_method" local_output_suffix = "evaluation_method"
elif selection == 3: elif selection == 3:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE) begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "qualification1" local_output_suffix = "qualification1"
elif selection == 4: elif selection == 4:
begin_pattern = re.compile( begin_pattern = regex.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)', r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
re.MULTILINE) regex.MULTILINE)
end_pattern = None end_pattern = None
local_output_suffix = "tobidders_notice" local_output_suffix = "tobidders_notice"
elif selection == 5: elif selection == 5:
begin_pattern = re.compile( begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明' r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明'
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "procurement" local_output_suffix = "procurement"
# begin_pattern = re.compile( # begin_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|' # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
# r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE) # r'^[一二三四五六七八九十百千]+、\s*采购清单', regex.MULTILINE)
# end_pattern = re.compile( # end_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
elif selection == 6: elif selection == 6:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*') begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "format" local_output_suffix = "format"
else: else:
print("无效的选择:请选择1-5") print("无效的选择:请选择1-5")
@ -807,17 +774,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="
# ztbfile.pdf少资格评审 包头少符合性评审 # ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
    # Manual driver for local debugging; paths point at the developer's machine.
    logger = get_global_logger("123")
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\fsdownload\42bd5604-fb85-43ff-821f-a1ea78fec115\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
    # files = truncate_pdf_multiple(input_path, output_folder,logger)
    # selections = [3,5]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    # print(files)
    # selection: 1 = notice, 2 = evaluation method, 3 = qualification review
    # (suffix qualification1, or qualification2 when it coincides with the
    # evaluation method), 4 = bidder instructions (front table as part1,
    # body as part2), 5 = procurement requirements.
    selection = 3
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)

View File

@ -393,7 +393,6 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
1. 既有资格性审查内容也有符合性审查内容时 1. 既有资格性审查内容也有符合性审查内容时
2. 它们的内容在同一张表格中 2. 它们的内容在同一张表格中
3. 表中没有两个合并单元格内容为'资格性审查''符合性审查'类似的表述只有'资格性审查和符合性审查'的合并表述 3. 表中没有两个合并单元格内容为'资格性审查''符合性审查'类似的表述只有'资格性审查和符合性审查'的合并表述
4. 表头
以下为示例表格1 以下为示例表格1
| 序号 | 资格性检查和符合性检查内容 | | 序号 | 资格性检查和符合性检查内容 |
@ -462,7 +461,7 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
输出要求 输出要求
1.请以json格式给出外层为'资格性和符合性审查'最内层的值需要用列表包裹 1.请以json格式给出外层为'资格性和符合性审查'最内层的值需要用列表包裹
2.一层嵌套内的键需要总结分类为某类评审因素或是直接使用原文中的评审因素字段标题 2.一层嵌套内的键需要总结分类为某类评审因素或是直接使用原文中的评审因素字段标题
3.你的回答要与原文完全一致 3.你的回答要与原文完全一致若审查标准在表格中那么单元格内的内容基本都要涵盖不要遗漏作为键值中的字符串列表项
4.最大细分为二层嵌套即可 4.最大细分为二层嵌套即可
输出示例 输出示例
{ {