11.8采购要求更加完整
This commit is contained in:
parent
5db9dd3a65
commit
1e4949e83b
@ -245,8 +245,8 @@ def outer_post_processing(combined_data, includes,good_list):
|
|||||||
extracted_info = inner_post_processing(base_info)
|
extracted_info = inner_post_processing(base_info)
|
||||||
# 将 '基础信息' 保留在处理后的数据中
|
# 将 '基础信息' 保留在处理后的数据中
|
||||||
processed_data["基础信息"] = base_info
|
processed_data["基础信息"] = base_info
|
||||||
# 提取 '采购要求' 下的 '技术要求'
|
# 提取 '采购要求' 下的 '采购需求'
|
||||||
tech_requirements = get_nested(base_info, ["采购要求", "技术要求"], {})
|
tech_requirements = get_nested(base_info, ["采购要求", "采购需求"], {})
|
||||||
if tech_requirements:
|
if tech_requirements:
|
||||||
procurement_reqs = extract_matching_keys(tech_requirements,good_list)
|
procurement_reqs = extract_matching_keys(tech_requirements,good_list)
|
||||||
else:
|
else:
|
||||||
|
@ -96,7 +96,7 @@ def extract_text_by_page(file_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile_procurement.pdf'
|
file_path ="C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||||
|
@ -75,8 +75,6 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
|
|||||||
return [""] # 返回空字符串
|
return [""] # 返回空字符串
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
|
# 默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
|
||||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
||||||
output_suffix="normal"):
|
output_suffix="normal"):
|
||||||
@ -180,23 +178,6 @@ def get_patterns_for_evaluation_method():
|
|||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
return begin_pattern, end_pattern
|
return begin_pattern, end_pattern
|
||||||
|
|
||||||
def get_patterns_for_qualification():
|
|
||||||
# begin_pattern = re.compile(
|
|
||||||
# r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$',
|
|
||||||
# re.MULTILINE
|
|
||||||
# )
|
|
||||||
begin_pattern = re.compile(
|
|
||||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查表|符合性审查表)', #目前可以匹配 附录3,因为允许了'附录'匹配
|
|
||||||
re.MULTILINE
|
|
||||||
)
|
|
||||||
end_pattern = re.compile(
|
|
||||||
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|'
|
|
||||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', #更宽松的匹配
|
|
||||||
re.MULTILINE
|
|
||||||
)
|
|
||||||
return begin_pattern, end_pattern
|
|
||||||
def get_patterns_for_notice():
|
def get_patterns_for_notice():
|
||||||
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
|
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
|
||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
@ -489,31 +470,61 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
|
|||||||
|
|
||||||
return start_page1, end_page1,end_page2
|
return start_page1, end_page1,end_page2
|
||||||
|
|
||||||
def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header):
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||||
|
begin_pattern = re.compile(
|
||||||
|
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
priority_pattern = re.compile(
|
||||||
|
r'^(资格性检查|资格审查|符合性审查)',
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
end_pattern = re.compile(
|
||||||
|
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|' #可能出现附件与符合性审查不在同一行的情况,误判结束位置。
|
||||||
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
print("第二次尝试:匹配附件")
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
include_keywords = ["资格审查", "资质审查", "符合性审查"]
|
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
|
||||||
exclude_keywords = ["声明函", "承诺函"]
|
exclude_keywords = ["声明函", "承诺函"]
|
||||||
|
|
||||||
# 从章节开始后的位置进行检查
|
# 从章节开始后的位置进行检查
|
||||||
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
# 确定起始页,需在last_begin_index之后
|
|
||||||
if (
|
# 优先检查是否匹配优先模式
|
||||||
any(keyword in cleaned_text for keyword in include_keywords) and
|
priority_match = priority_pattern.search(cleaned_text)
|
||||||
all(keyword not in cleaned_text for keyword in exclude_keywords) and
|
if priority_match and start_page is None:
|
||||||
start_page is None
|
start_page = i
|
||||||
):
|
# print(f"匹配到优先模式,设置起始页为: {start_page}")
|
||||||
if begin_pattern.search(cleaned_text):
|
else:
|
||||||
print("第二次尝试:匹配附件")
|
# 如果未匹配优先模式,则按照一般模式进行判断
|
||||||
start_page = i
|
if (
|
||||||
|
any(keyword in cleaned_text for keyword in include_keywords) and
|
||||||
|
all(keyword not in cleaned_text for keyword in exclude_keywords) and
|
||||||
|
start_page is None
|
||||||
|
):
|
||||||
|
if begin_pattern.search(cleaned_text):
|
||||||
|
start_page = i
|
||||||
|
print(f"匹配到附录等模式,设置起始页为: {start_page}")
|
||||||
|
|
||||||
# 确定结束页
|
# 确定结束页
|
||||||
if start_page is not None and end_pattern.search(cleaned_text):
|
if start_page is not None and end_pattern.search(cleaned_text):
|
||||||
if i > start_page:
|
if i > start_page:
|
||||||
end_page = i
|
end_page = i
|
||||||
|
# print(f"找到结束页: {end_page}")
|
||||||
break # 找到结束页后退出循环
|
break # 找到结束页后退出循环
|
||||||
return start_page,end_page
|
|
||||||
|
return start_page, end_page
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
||||||
try:
|
try:
|
||||||
@ -527,17 +538,13 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
|
|||||||
patterns = [get_patterns_for_procurement()]
|
patterns = [get_patterns_for_procurement()]
|
||||||
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
|
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
|
||||||
patterns = [get_patterns_for_evaluation_method()]
|
patterns = [get_patterns_for_evaluation_method()]
|
||||||
elif output_suffix == "qualification1":
|
|
||||||
patterns = [get_patterns_for_qualification()]
|
|
||||||
elif output_suffix == "notice":
|
elif output_suffix == "notice":
|
||||||
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
|
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
|
||||||
|
elif output_suffix == "qualification1":
|
||||||
|
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
|
||||||
if patterns:
|
if patterns:
|
||||||
for pattern_pair in patterns:
|
for pattern_pair in patterns:
|
||||||
if output_suffix=="qualification1":
|
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
||||||
start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
|
|
||||||
else:
|
|
||||||
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
|
||||||
common_header,exclusion_pattern, output_suffix)
|
common_header,exclusion_pattern, output_suffix)
|
||||||
if start_page is not None and end_page is not None:
|
if start_page is not None and end_page is not None:
|
||||||
break
|
break
|
||||||
@ -803,17 +810,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
|||||||
|
|
||||||
#ztbfile.pdf少资格评审 包头少符合性评审
|
#ztbfile.pdf少资格评审 包头少符合性评审
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1).pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4"
|
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
||||||
# files = truncate_pdf_multiple(input_path, output_folder)
|
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||||
# selections = [1,4]
|
# selections = [1,4]
|
||||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||||
# print(files)
|
# print(files)
|
||||||
selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
selection = 3# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
print(generated_files)
|
Loading…
x
Reference in New Issue
Block a user