12.9 截取pdf逻辑优化

This commit is contained in:
zy123 2024-12-10 17:32:08 +08:00
parent 329f5680ec
commit d83583dc41
5 changed files with 250 additions and 305 deletions

View File

@ -12,10 +12,9 @@ def extract_text_by_page(file_path):
page = reader.pages[page_num]
text = page.extract_text()
if text:
print(text)
# print(text)
cleaned_text = clean_page_content(text,common_header)
# cleaned_text=text
# print(cleaned_text)
print(cleaned_text)
print("-----------------"+str(page_num))
result += cleaned_text
# print(f"Page {page_num + 1} Content:\n{cleaned_text}")
@ -119,8 +118,8 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件.pdf"
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"

View File

@ -1,55 +1,26 @@
"""Scratch check for the '须知正文' heading pattern.

Runs the pattern over a handful of sample strings and prints, for each one,
whether it matched.
"""
import regex

# A '...须知正文' heading at the end of a line. The look-behinds reject the heading
# when it directly follows (optionally separated by whitespace) 见, 与 or a quote
# character. The look-behinds are variable-width, which the third-party `regex`
# module accepts.
begin_pattern = regex.compile(
    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
    r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
    regex.MULTILINE
)

# Sample inputs; the trailing comment is the expected outcome.
test_strings = [
    '投标人须知正文',  # match
    '”投标人须知正文',  # no match
    '” 投标人须知正文',  # no match
    '与 投标人须知正文',  # no match
    '见 投标人须知正文',  # no match
    '“ 投标人须知正文',  # no match
    '供应商须知正文',  # match
    '谈判供应商须知正文'  # match
]

for s in test_strings:
    if begin_pattern.search(s):
        print(f"匹配: {s}")
    else:
        print(f"不匹配: {s}")

View File

@ -1,4 +1,4 @@
import re
import regex
import os
import time
from PyPDF2 import PdfReader, PdfWriter
@ -41,7 +41,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
for page_num in range(min(start_page, total_pages)):
page_text = pdf_document.pages[page_num].extract_text()
cleaned_text = clean_page_content(page_text, common_header)
if re.search(r'\s*录', cleaned_text, re.MULTILINE):
if regex.search(r'\s*录', cleaned_text, regex.MULTILINE):
toc_page = page_num
break
@ -75,7 +75,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
is_secondary_match):
pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
def run_extraction():
start_page = None
@ -87,11 +87,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
continue
if start_page is None:
match = re.search(begin_pattern, cleaned_text)
match = regex.search(begin_pattern, cleaned_text)
if match and i > begin_page:
start_page = i
matched_text = match.group(0) # 获取整个匹配的文本
@ -105,16 +105,16 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
if chapter_type:
# 根据 chapter_type 动态生成 end_pattern
if not is_secondary_match:
end_pattern = re.compile(
end_pattern = regex.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|'
r'^评标(方法|办法)前附表|'
r'^附录(?:一)?[:]|'
r'^附件(?:一)?[:]|'
r'^附表(?:一)?[:]',
re.MULTILINE
regex.MULTILINE
)
else:
end_pattern = re.compile(
end_pattern = regex.compile(
rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$'
)
# 根据 chapter_type 动态生成 additional_mid_pattern
@ -126,51 +126,50 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
additional_mid_pattern = ''
# 定义基础的 mid_pattern
base_mid_pattern = (
r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)'
)
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
# 合并基础模式和额外模式
if additional_mid_pattern:
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
re.MULTILINE
regex.MULTILINE
)
else:
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
re.MULTILINE
regex.MULTILINE
)
# print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern
else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
if not is_secondary_match:
end_pattern = re.compile(
end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line
r'^评标(方法|办法)前附表|' # Match "评标办法前附表" at the beginning of a line
r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon
r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon
r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon
re.MULTILINE
regex.MULTILINE
)
else:
end_pattern = re.compile(
end_pattern = regex.compile(
rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$'
)
# print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
re.MULTILINE
regex.MULTILINE
)
continue
if start_page is not None and mid_page is None and combined_mid_pattern:
if re.search(combined_mid_pattern, cleaned_text):
if regex.search(combined_mid_pattern, cleaned_text):
mid_page = i
if start_page is not None and mid_page is not None and chapter_type:
if re.search(end_pattern, cleaned_text):
if regex.search(end_pattern, cleaned_text):
if i > mid_page:
end_page = i
break
@ -195,22 +194,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
pdf_document = PdfReader(pdf_path)
start_page = None
end_page = None
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
# 遍历文档的每一页,查找开始和结束短语的位置
for i in range(len(pdf_document.pages)):
page = pdf_document.pages[i]
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
# if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
if re.search(exclusion_pattern, cleaned_text):
# if is_secondary_match and regex.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
if regex.search(exclusion_pattern, cleaned_text):
continue
if re.search(begin_pattern, cleaned_text) and i >= begin_page:
if regex.search(begin_pattern, cleaned_text) and i >= begin_page:
if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
pass
else:
start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text):
if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,cleaned_text):
condition = i > start_page
if condition:
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
@ -240,16 +239,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
end_pattern_attachment = re.compile(
end_pattern_attachment = regex.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
re.MULTILINE
regex.MULTILINE
)
# 定义结束匹配模式 - 章节标题、评标附表、投标人须知等
end_pattern_chapter = re.compile(
end_pattern_chapter = regex.compile(
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
r'\s*评标(?:办法|方法)前附表\s*$|'
r'投标人须知',
re.MULTILINE
regex.MULTILINE
)
start_page = None
end_page = None
@ -260,7 +259,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
cleaned_text = clean_page_content(text, common_header)
# 确定起始页需在last_begin_index之后
if any(key in cleaned_text for key in include_keys):
if re.search(begin_pattern, cleaned_text, re.MULTILINE):
if regex.search(begin_pattern, cleaned_text, regex.MULTILINE):
if start_page is None:
start_page = i # 确保起始页不小于章节的开始页码
continue
@ -305,23 +304,28 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
def get_start_and_common_header(input_path):
    """Return the common page header and the index of the first announcement page.

    The common header comes from ``extract_common_header(input_path)``. The PDF is
    then scanned page by page (0-based index):

    * once the index exceeds 10 the scan stops and 0 is returned as the index;
    * pages without extractable text are ignored;
    * pages whose cleaned text has a line ending in "目录" (table of contents)
      are skipped;
    * the first remaining page whose cleaned text has a line ending in one of
      the announcement/invitation keywords is returned.

    Returns:
        tuple: ``(common_header, page_index)``; ``page_index`` is 0 when no page matched.
    """
    common_header = extract_common_header(input_path)
    max_scan_index = 10  # pages with a larger index are never inspected
    # Keyword at end of line, optionally followed by ")". The look-behinds reject a
    # keyword that directly follows 见, 与 or a quote character (optional whitespace
    # in between).
    begin_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # Table-of-contents marker: a line ending in "目录" (whitespace allowed between the characters).
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > max_scan_index:
            return common_header, 0
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        if catalog_pattern.search(cleaned_text):
            # A table-of-contents page would otherwise match the keywords it lists.
            continue
        if begin_pattern.search(cleaned_text):
            return common_header, i
    return common_header, 0
def truncate_pdf_main(input_path, output_folder, selection):
if os.path.isdir(input_path):
generated_files = []
@ -340,19 +344,21 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 1: 投标人须知前附表
pattern_pairs = [
(
re.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'),
re.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE)
regex.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
),
(
re.compile(
r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE),
re.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE)
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
regex.MULTILINE
),
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
)
]
output_suffix = "tobidders_notice"
@ -360,23 +366,23 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 2: 评标办法
pattern_pairs = [
(
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
regex.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))',regex.MULTILINE),
# Alternative begin pattern
re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
regex.compile(r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[:]清标报告\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',regex.MULTILINE)
# Alternative end pattern
),
(
re.compile(
r'(?<!见)' # 确保前面不是“见”
regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 确保前面不是“见”
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符(中文、顿号、括号)
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$' # 继续匹配允许的字符直到行尾
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
re.MULTILINE
regex.MULTILINE
),
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
regex.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
)
]
output_suffix = "evaluation_method"
@ -385,19 +391,19 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 3: 资格审查条件
pattern_pairs = [
# (
# re.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*',
# re.MULTILINE),
# re.compile(
# regex.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*',
# regex.MULTILINE),
# regex.compile(
# r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$'
# r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
# r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
# ),
(
re.compile(
r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
re.MULTILINE),
re.compile(
regex.compile(
r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
regex.MULTILINE),
regex.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|'
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
)
]
output_suffix = "qualification"
@ -405,13 +411,16 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 4: 招标公告
pattern_pairs = [
(
re.compile(
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
),
(
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE),
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', regex.MULTILINE),
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
regex.MULTILINE
)
)
]
output_suffix = "notice"
@ -420,13 +429,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 5: 无效标
pattern_pairs = [
(
re.compile(
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', regex.MULTILINE)
),
(
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', regex.MULTILINE),
regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', regex.MULTILINE)
)
]
output_suffix = "invalid"
@ -591,13 +600,13 @@ if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
# files=truncate_pdf_multiple(input_path,output_folder)
# print(files)
# selections = [4, 1] # 仅处理 selection 4、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
selection = 4 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files)
# print("生成的文件:", generated_files)

View File

@ -1,7 +1,8 @@
import glob
import logging
from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import regex # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
@ -16,41 +17,6 @@ def get_global_logger(unique_id):
logger = logging.getLogger(unique_id)
return logger
# fitz库版本
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
def is_pdf_or_doc(filename):
    """Return True when *filename* names a PDF or Word document.

    The check is a case-insensitive test for the suffixes ``.pdf``, ``.doc``
    and ``.docx``.

    Args:
        filename: A ``str``, ``bytes`` or ``os.PathLike`` file name or path.
            Any other value (for example ``None``) yields ``False`` instead
            of raising.

    Returns:
        bool: Whether the name ends with one of the supported suffixes.
    """
    try:
        name = os.fspath(filename)
    except TypeError:
        # Not a path-like value, so it cannot name a document.
        return False
    if isinstance(name, bytes):
        name = os.fsdecode(name)
    return name.lower().endswith(('.pdf', '.doc', '.docx'))
@ -77,31 +43,33 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
return [result or ""]
return [""] # 返回空字符串
# Author's note: by default start_page is not set again once it has matched. The table of
# contents is usually not matched because begin_page=5 is set, but when matching
# '第一章 招标公告' start_page may still be matched against the table of contents by mistake.
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal"):
    """Locate the start and end page of a section by scanning pages from ``begin_page``.

    Each page's text (``extract_text() or ""``) is cleaned with
    ``clean_page_content`` and then checked in this order:

    1. Exclusion: a page matching ``exclusion_pattern`` is skipped, but only
       once per call. For ``output_suffix == "tobidders_notice"`` the skip is
       only considered after the start page has been found.
    2. Start: the first page matching ``begin_pattern`` becomes the start page.
       Its index must be greater than ``begin_page``, except for
       ``output_suffix == "notice"`` where ``begin_page`` itself is allowed.
       The start page is never tested as an end page.
    3. End: the first later page matching ``end_pattern`` becomes the end page.
       Unless ``output_suffix == "tobidders_notice"``, that page must also not
       match ``begin_pattern``.

    Args:
        pdf_document: Object exposing a sliceable ``pages`` sequence.
        begin_pattern: Pattern (str or compiled) identifying the start page.
        end_pattern: Pattern (str or compiled) identifying the end page.
        begin_page (int): Index of the first page to scan.
        common_header: Header text passed to ``clean_page_content``.
        exclusion_pattern: Optional pattern for the one-time page skip.
        output_suffix (str): Section kind; selects the variants described above.

    Returns:
        tuple: ``(start_page, end_page)``; either value is ``None`` when not found.
    """
    start_page = None
    end_page = None
    is_tobidders_notice = output_suffix == "tobidders_notice"
    exclusion_available = True  # the exclusion skip may fire at most once
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)

        # One-time exclusion skip.
        may_exclude = exclusion_available and (not is_tobidders_notice or start_page is not None)
        if exclusion_pattern and may_exclude and regex.search(exclusion_pattern, cleaned_text):
            exclusion_available = False
            continue

        # Start page detection.
        if start_page is None and regex.search(begin_pattern, cleaned_text):
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
                continue

        # End page detection.
        if start_page is not None:
            if is_tobidders_notice:
                if regex.search(end_pattern, cleaned_text) and i > start_page:
                    end_page = i
                    break
            else:
                if regex.search(end_pattern, cleaned_text) and i > start_page and not regex.search(begin_pattern, cleaned_text):
                    end_page = i
                    break
    return start_page, end_page
@ -115,8 +83,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
total_pages = len(pdf_document.pages) - 1 # 获取总页数
if output_suffix == "tobidders_notice":
exclusion_pattern = re.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成')
start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
)
@ -138,8 +106,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
else:
# 原有的处理逻辑保持不变
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
exclusion_pattern = re.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
common_header, exclusion_pattern, output_suffix)
# 针对 selection = 6 的特殊处理
@ -164,62 +132,45 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
def get_patterns_for_procurement():
    """Build the begin/end patterns for the procurement-requirements section.

    Both patterns are compiled with ``regex.MULTILINE`` and start with
    look-behinds that reject a heading directly following 见, 与 or a quote
    character (optional whitespace in between).

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``regex`` patterns.
    """
    begin_pattern = regex.compile(
        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # "第X章" or "第X部分"
        r'[\u4e00-\u9fff、()]*?'  # allowed title characters
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|'  # 服务/项目/商务/技术 ... 要求
        r'(?:采购|需求)[\u4e00-\u9fff、()]*?)\s*$',  # or a title containing 采购 / 需求
        regex.MULTILINE
    )
    # Any chapter/part heading that ends its line closes the section.
    end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
    return begin_pattern, end_pattern
def get_patterns_for_evaluation_method():
    """Build the begin/end patterns for the evaluation-method section.

    The begin pattern has two alternatives:

    * a "第X章/部分" heading whose line contains one of 磋商/谈判/评标/评定/评审
      and one of 办法/方法/内容; the ``.*`` inside the look-aheads lets those
      keywords sit anywhere in the rest of the line, while the consumed title
      is still limited to the allowed character set;
    * a line consisting of "评标方法前附表" or "评标办法前附表".

    Both patterns are compiled with ``regex.MULTILINE`` and use look-behinds
    that reject a heading directly following 见, 与 or a quote character.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``regex`` patterns.
    """
    begin_pattern = regex.compile(
        r'(?:(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first alternative
        r'(?:[\u4e00-\u9fff、()]*?)'
        r'(?=.*(?:磋商|谈判|评标|评定|评审))'
        r'(?=.*(?:办法|方法|内容))'
        r'[\u4e00-\u9fff、()]*\s*$|'
        r'^\s*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)',  # second alternative
        regex.MULTILINE
    )
    # Any chapter/part heading that ends its line closes the section.
    end_pattern = regex.compile(
        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
    return begin_pattern, end_pattern
def get_patterns_for_notice():
    """Build the begin/end patterns for the tender-notice section.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``regex`` patterns.
    """
    # NOTE(review): compiled without MULTILINE, so "^" only anchors at the very start of
    # the searched text, not at every line start. Confirm this is intended.
    begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = regex.compile(
        # Earlier, narrower alternative kept for reference:
        # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
def get_patterns_for_notice_twice():
    """Build the fallback begin/end patterns for the tender-notice section.

    Unlike ``get_patterns_for_notice`` the begin pattern is compiled with
    ``re.MULTILINE``, so the chapter heading may start on any line of the page,
    and it only looks for 公告 / 邀请书.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``re`` patterns.
    """
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
    )
    end_pattern = re.compile(
        # Earlier, narrower alternative kept for reference:
        # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE
    )
    return begin_pattern, end_pattern
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
# exclusion_pattern):
# start_page = None
@ -228,14 +179,14 @@ def get_patterns_for_notice_twice():
# for i, page in enumerate(pdf_document.pages):
# text = page.extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
# if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
# continue
# if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
# start_page = i
# if start_page is not None and mid_page is None and re.search(
# if start_page is not None and mid_page is None and regex.search(
# r'^\s*[(]?\s*[一1]\s*[)]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
# mid_page = i
# if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
# end_page = i
# break
# return start_page, mid_page, end_page
@ -248,10 +199,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
参数:
pdf_document (PDFDocument): 要处理的PDF文档对象
begin_pattern (str re.Pattern): 用于识别起始的正则表达式模式
begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
begin_page (int): 开始搜索的页码
common_header (str): 每页需要清理的公共头部文本
exclusion_pattern (str re.Pattern): 用于排除某些页的模式
exclusion_pattern (str regex.Pattern): 用于排除某些页的模式
返回:
tuple: (start_page, mid_page, end_page) 如果成功否则 (None, None, None)
@ -264,8 +215,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
如果未提供 local_end_pattern则根据匹配的章节类型动态生成 end_pattern
参数:
local_begin_pattern (str re.Pattern): 用于识别起始的正则表达式模式
local_end_pattern (str re.Pattern, 可选): 用于识别结束的正则表达式模式
local_begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
local_end_pattern (str regex.Pattern, 可选): 用于识别结束的正则表达式模式
返回:
tuple: (start_page, mid_page, end_page)
@ -281,12 +232,12 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
cleaned_text = clean_page_content(text, common_header)
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
continue
# 识别起始页
if start_page is None:
match = re.search(local_begin_pattern, cleaned_text)
match = regex.search(local_begin_pattern, cleaned_text)
if match and i > begin_page:
start_page = i
matched_text = match.group(0) # 获取整个匹配的文本
@ -302,9 +253,9 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
if chapter_type:
# 根据 chapter_type 动态生成 end_pattern
end_pattern_dynamic = re.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+',
re.MULTILINE
end_pattern_dynamic = regex.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
regex.MULTILINE
)
# 根据 chapter_type 动态生成 additional_mid_pattern
@ -317,54 +268,55 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
# 合并基础模式和额外模式
if additional_mid_pattern:
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
re.MULTILINE
regex.MULTILINE
)
else:
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
re.MULTILINE
regex.MULTILINE
)
else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
end_pattern_dynamic = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
re.MULTILINE
end_pattern_dynamic = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
regex.MULTILINE
)
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
re.MULTILINE
regex.MULTILINE
)
else:
# 如果提供了固定的 end_pattern则使用默认的 mid_pattern
base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
combined_mid_pattern = re.compile(
combined_mid_pattern = regex.compile(
rf'{base_mid_pattern}',
re.MULTILINE
regex.MULTILINE
)
continue
# 识别中间页
if start_page is not None and mid_page is None and combined_mid_pattern:
if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text):
if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
continue
if re.search(combined_mid_pattern, cleaned_text):
if regex.search(combined_mid_pattern, cleaned_text):
mid_page = i
# 识别结束页
if start_page is not None and mid_page is not None:
# 使用提供的 end_pattern 或动态生成的 end_pattern
current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
if re.search(current_end_pattern, cleaned_text):
if regex.search(current_end_pattern, cleaned_text):
if i > mid_page:
end_page = i
break
@ -376,21 +328,21 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
if not (start_page and mid_page and end_page):
print(f"第二次尝试 tobidders_notice!{pdf_path}")
print(f"第二次尝试 tobidders_notice!")
pdf_document = PdfReader(pdf_path)
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
if start_page and end_page and mid_page:
return start_page, mid_page, end_page
else:
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile(
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE
regex.MULTILINE
)
new_end_pattern = re.compile(
new_end_pattern = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE
regex.MULTILINE
)
print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式
@ -401,19 +353,19 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
output_suffix = "tobidders_notice"
begin_pattern = re.compile(
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
re.MULTILINE
regex.MULTILINE
)
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE # 捕获中文部分
end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
)
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成')
# 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
if start_page1 is None or end_page1 is None:
return "", "", ""
@ -438,36 +390,36 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
exclusion_pattern, output_suffix)
if end_page2 is None:
return start_page1, end_page1, end_page1
return start_page1, end_page1, ""
return start_page1, end_page1, end_page2
def extract_pages_qualification(pdf_document, begin_page, common_header):
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = re.compile(
begin_pattern = regex.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)',
re.MULTILINE
regex.MULTILINE
)
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
priority_pattern = re.compile(
priority_pattern = regex.compile(
r'^(资格性检查|资格审查|符合性审查)',
re.MULTILINE
regex.MULTILINE
)
# 结束匹配模式 - 附录、附件、附表等
end_pattern_attachment = re.compile(
end_pattern_attachment = regex.compile(
r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$',
re.MULTILINE
regex.MULTILINE
)
# 结束匹配模式 - 章节标题
end_pattern_chapter = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE
end_pattern_chapter = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
regex.MULTILINE
)
print("第二次尝试:匹配附件")
print("第二次尝试 qualification:匹配附件")
start_page = None
end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
@ -519,7 +471,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
try:
exclusion_pattern = re.compile(
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
pdf_document = PdfReader(pdf_path)
patterns = None
@ -531,7 +483,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()]
elif output_suffix == "notice":
patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()]
patterns = [get_patterns_for_notice()]
elif output_suffix == "qualification1":
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
if patterns:
@ -545,11 +497,17 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
if output_suffix == "qualification1":
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
print("第三次尝试资格审查:尝试提取评分办法章节...")
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")
if os.path.isfile(evaluation_method_file):
print(f"找到评分办法章节文件: {evaluation_method_file},直接返回。")
return evaluation_method_file
else:
return ""
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
else:
return ""
else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
@ -593,20 +551,27 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
print(f"Error in save_extracted_pages: {e}")
return "" # 返回空字符串
def get_start_and_common_header(input_path):
    """Locate the page on which the tender announcement begins.

    The common page header is extracted first and stripped from every page
    before matching. Only pages with index 0..10 are inspected; a page is
    accepted when its cleaned text contains a line ending in an
    announcement / invitation title and the page does not contain a
    table-of-contents heading.

    Args:
        input_path: Path of the PDF file to scan.

    Returns:
        A tuple ``(common_header, begin_index)``. ``begin_index`` is the
        0-based index of the first matching page, or 0 when no page within
        the scanned range matches.
    """
    common_header = extract_common_header(input_path)
    # The variable-width look-behinds (e.g. ``(?<!见\s*)``) reject titles that
    # are merely referenced ("see ...", "and ...", or quoted). They need the
    # ``regex`` module: the standard ``re`` module only accepts fixed-width
    # look-behind.
    begin_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # A line ending in "目录" marks a table-of-contents page.
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
            # Past page index 10 without a hit: fall back to the first page.
            return common_header, 0
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        # Table-of-contents pages list the chapter titles and would otherwise
        # produce a false match, so they are skipped entirely.
        if catalog_pattern.search(cleaned_text):
            continue
        if begin_pattern.search(cleaned_text):
            # First matching page wins; the index is 0-based.
            return common_header, i
    return common_header, 0
@ -658,41 +623,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
# 根据选择设置对应的模式和结束模式
if selection == 1:
begin_pattern = re.compile(
r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
begin_pattern = regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE
)
end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$', regex.MULTILINE)
local_output_suffix = "notice"
elif selection == 2:
begin_pattern = re.compile(
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "evaluation_method"
elif selection == 3:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE)
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "qualification1"
elif selection == 4:
begin_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)',
re.MULTILINE)
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
regex.MULTILINE)
end_pattern = None
local_output_suffix = "tobidders_notice"
elif selection == 5:
begin_pattern = re.compile(
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明'
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "procurement"
# begin_pattern = re.compile(
# begin_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
# r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
# r'^[一二三四五六七八九十百千]+、\s*采购清单', regex.MULTILINE)
# end_pattern = regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
elif selection == 6:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
local_output_suffix = "format"
else:
print("无效的选择:请选择1-5")
@ -807,17 +774,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
    # Manual debugging entry point. All paths below are specific to the
    # developer's machine; the commented-out lines are alternative inputs.
    logger = get_global_logger("123")
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    # input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf"
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\fsdownload\42bd5604-fb85-43ff-821f-a1ea78fec115\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
    # files = truncate_pdf_multiple(input_path, output_folder,logger)
    # selections = [3,5]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    # print(files)
    # Selection codes: 1 - notice, 2 - evaluation method,
    # 3 - qualification review (output suffix qualification1, or
    #     qualification2 when it coincides with the evaluation method),
    # 4 - instructions to bidders (part1: preliminary table, part2: main text),
    # 5 - procurement requirements.
    selection = 3
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)

View File

@ -393,7 +393,6 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
1. 既有资格性审查内容也有符合性审查内容时
2. 它们的内容在同一张表格中
3. 表中没有两个合并单元格内容为'资格性审查''符合性审查'类似的表述只有'资格性审查和符合性审查'的合并表述
4. 表头
以下为示例表格1
| 序号 | 资格性检查和符合性检查内容 |
@ -462,7 +461,7 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
输出要求
1.请以json格式给出外层为'资格性和符合性审查'最内层的值需要用列表包裹
2.一层嵌套内的键需要总结分类为某类评审因素或是直接使用原文中的评审因素字段标题
3.你的回答要与原文完全一致
3.你的回答要与原文完全一致若审查标准在表格中那么单元格内的内容基本都要涵盖不要遗漏作为键值中的字符串列表项
4.最大细分为二层嵌套即可
输出示例
{