10.29截取工程标新逻辑
This commit is contained in:
parent
54b2738ea1
commit
5a5ac65f34
@ -53,7 +53,8 @@ def aggregate_basic_info_engineering(baseinfo_list):
|
|||||||
dynamic_key_handling(key_groups, relevant_keys_detected)
|
dynamic_key_handling(key_groups, relevant_keys_detected)
|
||||||
|
|
||||||
# 创建一个副本以存储未分类的项目
|
# 创建一个副本以存储未分类的项目
|
||||||
unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]}
|
unclassified_items = {k: v for k, v in combined_data.items() if
|
||||||
|
k not in [item for sublist in key_groups.values() for item in sublist]}
|
||||||
|
|
||||||
# 按键组分类并嵌套
|
# 按键组分类并嵌套
|
||||||
for group_name, keys in key_groups.items():
|
for group_name, keys in key_groups.items():
|
||||||
@ -73,6 +74,7 @@ def aggregate_basic_info_engineering(baseinfo_list):
|
|||||||
|
|
||||||
return combined_data
|
return combined_data
|
||||||
|
|
||||||
|
|
||||||
def dynamic_key_handling(key_groups, detected_keys):
|
def dynamic_key_handling(key_groups, detected_keys):
|
||||||
# 检查和调整键组配置
|
# 检查和调整键组配置
|
||||||
for key in detected_keys:
|
for key in detected_keys:
|
||||||
@ -120,6 +122,7 @@ def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
|
|||||||
|
|
||||||
return updated_list
|
return updated_list
|
||||||
|
|
||||||
|
|
||||||
def process_judge_questions(judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1):
|
def process_judge_questions(judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1):
|
||||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||||
judge_consortium = judge_consortium_bidding(baseinfo_list1)
|
judge_consortium = judge_consortium_bidding(baseinfo_list1)
|
||||||
@ -138,6 +141,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, tobidders_notice_ta
|
|||||||
for question, response in res2:
|
for question, response in res2:
|
||||||
baseinfo_list1.append(clean_json_string(response))
|
baseinfo_list1.append(clean_json_string(response))
|
||||||
|
|
||||||
|
|
||||||
def process_baseinfo_list(baseinfo_list, tobidders_notice):
|
def process_baseinfo_list(baseinfo_list, tobidders_notice):
|
||||||
questions_list = []
|
questions_list = []
|
||||||
for item in baseinfo_list:
|
for item in baseinfo_list:
|
||||||
@ -157,6 +161,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice):
|
|||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def combine_basic_info(merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
|
def combine_basic_info(merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
|
||||||
"""
|
"""
|
||||||
综合和处理基础信息,生成最终的基础信息字典。
|
综合和处理基础信息,生成最终的基础信息字典。
|
||||||
@ -189,7 +194,8 @@ def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_no
|
|||||||
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||||
# 提交两个任务
|
# 提交两个任务
|
||||||
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1)
|
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table,
|
||||||
|
baseinfo_list1)
|
||||||
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
|
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
|
||||||
|
|
||||||
# 等待两个任务完成并获取结果
|
# 等待两个任务完成并获取结果
|
||||||
@ -205,6 +211,7 @@ def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_no
|
|||||||
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)
|
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)
|
||||||
return {"基础信息": aggregated_baseinfo}
|
return {"基础信息": aggregated_baseinfo}
|
||||||
|
|
||||||
|
|
||||||
# TODO:先不带投标人须知正文,如果是未知,再直接问正文,
|
# TODO:先不带投标人须知正文,如果是未知,再直接问正文,
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
@ -217,7 +224,3 @@ if __name__ == "__main__":
|
|||||||
print(json.dumps(res, ensure_ascii=False, indent=4))
|
print(json.dumps(res, ensure_ascii=False, indent=4))
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("elapsed_time:" + str(end_time - start_time))
|
print("elapsed_time:" + str(end_time - start_time))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -196,7 +196,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_
|
|||||||
r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon
|
r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
|
# print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
|
||||||
|
|
||||||
# 定义基础的 mid_pattern
|
# 定义基础的 mid_pattern
|
||||||
base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
|
base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
|
||||||
@ -204,9 +204,6 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_
|
|||||||
rf'{base_mid_pattern}',
|
rf'{base_mid_pattern}',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
print(
|
|
||||||
f"使用默认的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印默认的 combined_mid_pattern
|
|
||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if start_page is not None and mid_page is None and combined_mid_pattern:
|
if start_page is not None and mid_page is None and combined_mid_pattern:
|
||||||
@ -306,6 +303,8 @@ def get_start_and_common_header(input_path):
|
|||||||
begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE)
|
begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE)
|
||||||
pdf_document = PdfReader(input_path)
|
pdf_document = PdfReader(input_path)
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
|
if i > 25:
|
||||||
|
return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
@ -371,7 +370,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$', re.MULTILINE),
|
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE),
|
||||||
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
|
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
@ -381,7 +380,12 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
# Selection 5: 无效标
|
# Selection 5: 无效标
|
||||||
pattern_pairs = [
|
pattern_pairs = [
|
||||||
(
|
(
|
||||||
re.compile(r'第[一二三四五六七八九十]+章\s*(招标公告|.*邀请.*)|第一卷|招标编号:|招标编号:'),
|
re.compile(
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'),
|
||||||
|
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE)
|
||||||
|
),
|
||||||
|
(
|
||||||
|
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE),
|
||||||
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE)
|
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE)
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
@ -393,8 +397,14 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
|
|
||||||
for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
|
for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
|
||||||
is_secondary_match = (idx == 2)
|
is_secondary_match = (idx == 2)
|
||||||
|
begin_page = last_begin_index if last_begin_index != 0 else {
|
||||||
|
1: 3, #前附表
|
||||||
|
2: 10, #评标
|
||||||
|
3: 5, #资格
|
||||||
|
4: 0, #公告
|
||||||
|
5: 0 #无效标
|
||||||
|
}.get(selection, 0)
|
||||||
if selection == 1: #投标人须知
|
if selection == 1: #投标人须知
|
||||||
begin_page = 3
|
|
||||||
output_paths = list(
|
output_paths = list(
|
||||||
extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header))
|
extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header))
|
||||||
if output_paths and any(os.path.isfile(f) for f in output_paths):
|
if output_paths and any(os.path.isfile(f) for f in output_paths):
|
||||||
@ -402,10 +412,6 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
else:
|
else:
|
||||||
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
|
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
|
||||||
continue
|
continue
|
||||||
if selection == 2: #评分细则
|
|
||||||
begin_page = 10
|
|
||||||
else:
|
|
||||||
begin_page = last_begin_index
|
|
||||||
output_paths = extract_pages(
|
output_paths = extract_pages(
|
||||||
input_path,
|
input_path,
|
||||||
output_folder,
|
output_folder,
|
||||||
@ -669,7 +675,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
|
|||||||
#TODO:目前merged_baseinfo没有包含投标人须知正文。
|
#TODO:目前merged_baseinfo没有包含投标人须知正文。
|
||||||
#投标人须知前附表改为货物标一样的
|
#投标人须知前附表改为货物标一样的
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest7.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\新建文件夹"
|
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\新建文件夹"
|
||||||
@ -679,5 +685,5 @@ if __name__ == "__main__":
|
|||||||
# print(files)
|
# print(files)
|
||||||
selection = 5 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
selection = 5 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
@ -180,7 +180,7 @@ def convert_clause_to_json(input_path,output_folder,type=1):
|
|||||||
r'^附(?:录|件|表)(?:一)?[::]'
|
r'^附(?:录|件|表)(?:一)?[::]'
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
|
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:|投标邀请书|投标邀请函'
|
||||||
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
|
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
|
||||||
result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息
|
result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息
|
||||||
# 检查输出文件夹是否存在,如果不存在则创建
|
# 检查输出文件夹是否存在,如果不存在则创建
|
||||||
|
Loading…
x
Reference in New Issue
Block a user