1.6 商务带星要求在标题中->传递到具体要求中

This commit is contained in:
zy123 2025-01-07 14:03:58 +08:00
parent 691883cc99
commit 72d787c899
2 changed files with 16 additions and 17 deletions

View File

@ -68,7 +68,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \ base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \ r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$' r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)\s*须知正文\s*$'
# 合并基础模式和额外模式 # 合并基础模式和额外模式
if additional_mid_pattern: if additional_mid_pattern:
combined_mid_pattern = regex.compile( combined_mid_pattern = regex.compile(
@ -176,8 +176,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
if output_suffix == "qualification": if output_suffix == "qualification":
print("twice:qualificaiton!") print("twice:qualificaiton!")
# 动态设置 include_keys # 动态设置 include_keys
include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力","信誉"]
"信誉"]
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])' begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
@ -260,7 +259,7 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
regex.MULTILINE regex.MULTILINE
), ),
regex.compile( regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$', r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商|应答人)须知(?:前附表)?\s*$',
regex.MULTILINE regex.MULTILINE
) )
) )
@ -324,8 +323,8 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
pattern_pairs = [ pattern_pairs = [
( (
regex.compile( regex.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知|' r'(?:第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知|'
r'(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表)\s*$', r'(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表)\s*$',
regex.MULTILINE regex.MULTILINE
), ),
regex.compile( regex.compile(
@ -335,8 +334,8 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
), ),
( (
regex.compile( regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知\s*$|' r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知\s*$|'
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表\s*$', r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表\s*$',
regex.MULTILINE regex.MULTILINE
), ),
regex.compile( regex.compile(

View File

@ -209,7 +209,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
# 定义基础的 mid_pattern # 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \ base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \ r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$' r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)\s*须知正文\s*$'
# 合并基础模式和额外模式 # 合并基础模式和额外模式
if additional_mid_pattern: if additional_mid_pattern:
@ -276,8 +276,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
else: else:
# 定义新的 begin_pattern 和 end_pattern # 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = regex.compile( new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|谈判供应商|磋商供应商)须知\s*$|' r'.*(?:投标人|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商)须知前附表', r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表',
regex.MULTILINE regex.MULTILINE
) )
new_end_pattern = regex.compile( new_end_pattern = regex.compile(
@ -294,7 +294,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
output_suffix = "tobidders_notice" output_suffix = "tobidders_notice"
begin_pattern = regex.compile( begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知)+', r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知)+',
regex.MULTILINE regex.MULTILINE
) )
end_pattern = regex.compile( end_pattern = regex.compile(
@ -516,8 +516,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
local_output_suffix = "qualification1" local_output_suffix = "qualification1"
elif selection == 4: elif selection == 4:
begin_pattern = regex.compile( begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知+|' r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知+|'
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表\s*$', r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表\s*$',
regex.MULTILINE regex.MULTILINE
) )
end_pattern = None end_pattern = None
@ -612,11 +612,11 @@ if __name__ == "__main__":
logger = get_global_logger("123") logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\ztbfile.pdf" pdf_path=r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp" output_folder = r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 5 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
# print(generated_files) # print(generated_files)