1.6 商务带星要求在标题中->传递到具体要求中
This commit is contained in:
parent
691883cc99
commit
72d787c899
@ -68,7 +68,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
|||||||
# 定义基础的 mid_pattern
|
# 定义基础的 mid_pattern
|
||||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
||||||
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
|
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)\s*须知正文\s*$'
|
||||||
# 合并基础模式和额外模式
|
# 合并基础模式和额外模式
|
||||||
if additional_mid_pattern:
|
if additional_mid_pattern:
|
||||||
combined_mid_pattern = regex.compile(
|
combined_mid_pattern = regex.compile(
|
||||||
@ -176,8 +176,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
|||||||
if output_suffix == "qualification":
|
if output_suffix == "qualification":
|
||||||
print("twice:qualificaiton!")
|
print("twice:qualificaiton!")
|
||||||
# 动态设置 include_keys
|
# 动态设置 include_keys
|
||||||
include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力",
|
include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力","信誉"]
|
||||||
"信誉"]
|
|
||||||
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
|
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||||
begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
|
begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
|
||||||
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
|
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
|
||||||
@ -260,7 +259,7 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
|
|||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
|
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商|应答人)须知(?:前附表)?\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -324,8 +323,8 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
|
|||||||
pattern_pairs = [
|
pattern_pairs = [
|
||||||
(
|
(
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'(?:第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知|'
|
r'(?:第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知|'
|
||||||
r'(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',
|
r'(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表)\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
regex.compile(
|
regex.compile(
|
||||||
@ -335,8 +334,8 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
|
|||||||
),
|
),
|
||||||
(
|
(
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知\s*$|'
|
||||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
regex.compile(
|
regex.compile(
|
||||||
|
@ -209,7 +209,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
# 定义基础的 mid_pattern
|
# 定义基础的 mid_pattern
|
||||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
||||||
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
|
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)\s*须知正文\s*$'
|
||||||
|
|
||||||
# 合并基础模式和额外模式
|
# 合并基础模式和额外模式
|
||||||
if additional_mid_pattern:
|
if additional_mid_pattern:
|
||||||
@ -276,8 +276,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
else:
|
else:
|
||||||
# 定义新的 begin_pattern 和 end_pattern
|
# 定义新的 begin_pattern 和 end_pattern
|
||||||
new_begin_pattern = regex.compile(
|
new_begin_pattern = regex.compile(
|
||||||
r'.*(?:投标人|磋商|谈判|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
r'.*(?:投标人|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知\s*$|'
|
||||||
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商)须知前附表',
|
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
new_end_pattern = regex.compile(
|
new_end_pattern = regex.compile(
|
||||||
@ -294,7 +294,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
|
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
|
||||||
output_suffix = "tobidders_notice"
|
output_suffix = "tobidders_notice"
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知)+',
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知)+',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = regex.compile(
|
end_pattern = regex.compile(
|
||||||
@ -516,8 +516,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
local_output_suffix = "qualification1"
|
local_output_suffix = "qualification1"
|
||||||
elif selection == 4:
|
elif selection == 4:
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知+|'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知+|'
|
||||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|谈判供应商|磋商供应商|应答人)须知前附表\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = None
|
end_pattern = None
|
||||||
@ -612,11 +612,11 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\ztbfile.pdf"
|
pdf_path=r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp"
|
output_folder = r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 5 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
# print(generated_files)
|
# print(generated_files)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user