11.8采购要求更加完整

This commit is contained in:
zy123 2024-11-08 16:50:52 +08:00
parent 5db9dd3a65
commit 1e4949e83b
3 changed files with 52 additions and 45 deletions

View File

@ -245,8 +245,8 @@ def outer_post_processing(combined_data, includes,good_list):
extracted_info = inner_post_processing(base_info)
# 将 '基础信息' 保留在处理后的数据中
processed_data["基础信息"] = base_info
# 提取 '采购要求' 下的 '技术要求'
tech_requirements = get_nested(base_info, ["采购要求", "技术要"], {})
# 提取 '采购要求' 下的 '采购需求'
tech_requirements = get_nested(base_info, ["采购要求", "采购需"], {})
if tech_requirements:
procurement_reqs = extract_matching_keys(tech_requirements,good_list)
else:

View File

@ -96,7 +96,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile_procurement.pdf'
file_path ="C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"

View File

@ -75,8 +75,6 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
return [""] # 返回空字符串
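# Default logic: once start_page has matched, it is not reassigned. The table of contents is normally not matched because begin_page=5 is set; however, when matching '第一章 招标公告', start_page may still be wrongly matched on the table of contents.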
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"):
@ -180,23 +178,6 @@ def get_patterns_for_evaluation_method():
end_pattern = re.compile(
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
return begin_pattern, end_pattern
def get_patterns_for_qualification():
    """Build the begin/end regexes that delimit the qualification-review section.

    Returns:
        tuple: ``(begin_pattern, end_pattern)``. Both are compiled with
        ``re.MULTILINE`` so that ``^`` and ``$`` anchor on every line of the
        text they are searched against.
    """
    # Start marker: a line that begins with an appendix-style heading
    # (附录 / 附件 / 附表, optionally numbered 一 or 1, optionally followed by a
    # colon) or with one of the review-table titles. Because the number and
    # the colon are both optional, a bare "附录" prefix is enough, so headings
    # such as "附录3" are matched too.
    begin_pattern = re.compile(
        r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?|资格性检查|资格审查表|符合性审查表)',
        re.MULTILINE
    )
    # End marker, either of:
    #   1. a line starting with an appendix-style heading (with a colon, or
    #      "附件" followed by digits) that does NOT contain 资格 or 符合 anywhere
    #      on that line (enforced by the negative lookahead);
    #   2. a chapter/part heading ("第…章" / "第…部分" plus a title) that runs to
    #      the end of a line. This alternative is deliberately not anchored
    #      at the start of the line, which makes it the looser match.
    end_pattern = re.compile(
        r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$|'
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        re.MULTILINE
    )
    return begin_pattern, end_pattern
def get_patterns_for_notice():
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
end_pattern = re.compile(
@ -489,32 +470,62 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
return start_page1, end_page1,end_page2
# extract_pages_qualification: scan pdf_document.pages from begin_page onward,
# clean each page's text with clean_page_content(text, common_header), and
# return (start_page, end_page). Both values start as None and stay None when
# nothing matches.
#   - start_page: the first page where priority_pattern matches, or otherwise
#     the first page that contains an include keyword, contains no exclude
#     keyword, and matches begin_pattern.
#   - end_page: the first page after start_page on which end_pattern matches.
# NOTE(review): two `def` headers (5-arg and 3-arg) and two `include_keywords`
# assignments appear back to back, and no indentation is visible here. This
# looks like the pre- and post-change lines of a diff without +/- markers;
# confirm which lines are live before relying on this text.
def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header):
import re
def extract_pages_qualification(pdf_document, begin_page, common_header):
# Appendix-style heading at the start of a line (附录 / 附件 / 附表, with an
# optional 一/1 and an optional colon).
begin_pattern = re.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)',
re.MULTILINE
)
# Review-section titles at the start of a line; a hit here sets the start page
# without the keyword checks used by the general branch below.
priority_pattern = re.compile(
r'^(资格性检查|资格审查|符合性审查)',
re.MULTILINE
)
end_pattern = re.compile(
r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$|' # the appendix heading and the 资格/符合 wording may sit on different lines, in which case the end position is misjudged
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE
)
print("第二次尝试:匹配附件")
start_page = None
end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查"]
# This second assignment overrides the one above if both lines are live.
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
exclude_keywords = ["声明函", "承诺函"]
# Scan pages starting from begin_page (page indices stay absolute via start=begin_page)
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
# The start page must come after last_begin_index
# (original note; last_begin_index is not defined anywhere in this view)
# First check whether the page matches the priority pattern
priority_match = priority_pattern.search(cleaned_text)
if priority_match and start_page is None:
start_page = i
# print(f"匹配到优先模式,设置起始页为: {start_page}")
else:
# Priority pattern not applicable: fall back to the general check
# (include keyword present, no exclude keyword, appendix heading matched)
if (
any(keyword in cleaned_text for keyword in include_keywords) and
all(keyword not in cleaned_text for keyword in exclude_keywords) and
start_page is None
):
if begin_pattern.search(cleaned_text):
print("第二次尝试:匹配附件")
start_page = i
print(f"匹配到附录等模式,设置起始页为: {start_page}")
# Determine the end page: only pages strictly after start_page qualify
if start_page is not None and end_pattern.search(cleaned_text):
if i > start_page:
end_page = i
# print(f"找到结束页: {end_page}")
break # stop scanning once the end page has been found
return start_page, end_page
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
try:
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
@ -527,16 +538,12 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
patterns = [get_patterns_for_procurement()]
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()]
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification()]
elif output_suffix == "notice":
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
elif output_suffix == "qualification1":
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
if patterns:
for pattern_pair in patterns:
if output_suffix=="qualification1":
start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
else:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,exclusion_pattern, output_suffix)
if start_page is not None and end_page is not None:
@ -803,17 +810,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
#ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1).pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3"
# files = truncate_pdf_multiple(input_path, output_folder)
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 4# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
selection = 3# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files)