11.13
This commit is contained in:
parent
6e7da080bc
commit
4da7469e04
@ -149,22 +149,35 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
parts = current_section_number.split('.')
|
||||
matched_patterns = [pattern] # start with the full pattern
|
||||
|
||||
# Next section at same level
|
||||
parts[-1] = str(int(parts[-1]) + 1)
|
||||
# next_pattern = r'^' + r'\.\s*'.join(parts) #11.12以前版本
|
||||
next_pattern = r'^' + r'\s*[..]\s*'.join(parts)
|
||||
matched_patterns.append(next_pattern)
|
||||
# for i in range(1, 6): #同级,与matched_patterns = [pattern]重复了,故注释
|
||||
# # 复制 parts 列表以避免修改原列表
|
||||
# new_parts = parts.copy()
|
||||
# new_parts[-1] = str(int(new_parts[-1]) + i)
|
||||
# # 使用不同的分隔符
|
||||
# next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts)
|
||||
# matched_patterns.append(next_pattern)
|
||||
|
||||
# Parent section (if applicable)
|
||||
if len(parts) > 1:
|
||||
parent_section_parts = parts[:-1]
|
||||
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
|
||||
for i in range(1, 6): #考虑原文档的书写不规范,跳序号的情况,目前设置了范围<=5
|
||||
parent_section_parts = parts[:-1].copy()
|
||||
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
|
||||
parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts)
|
||||
matched_patterns.append(parent_pattern)
|
||||
|
||||
# 添加对 '数字 、' 格式的支持
|
||||
digit_comma_pattern = r'^\d+\s*、'
|
||||
matched_patterns.append(digit_comma_pattern)
|
||||
|
||||
# 获取当前顶级章节编号
|
||||
current_top_level_num = int(current_section_number.split('.')[0])
|
||||
for i in range(1, 6):
|
||||
next_top_level_num = current_top_level_num + i
|
||||
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..]'
|
||||
# 检查是否已经包含了该模式,避免重复
|
||||
if next_top_level_pattern not in matched_patterns:
|
||||
matched_patterns.append(next_top_level_pattern)
|
||||
|
||||
# Combine the patterns
|
||||
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
|
||||
current_section_pattern = re.compile(combined_pattern)
|
||||
|
@ -151,9 +151,10 @@ def find_forbidden(qualification=""):
|
||||
actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"]
|
||||
includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
|
||||
excludes = ["招标", "评标", "定标"]
|
||||
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
|
||||
processed_results = extract_unique_items_from_texts(forbidden_results)
|
||||
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
||||
# forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
|
||||
# processed_results = extract_unique_items_from_texts(forbidden_results)
|
||||
# merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
||||
merged_forbidden_list = list(dict.fromkeys(actual_list))
|
||||
forbidden_dict = {'不得存在的其他情形': merged_forbidden_list}
|
||||
return forbidden_dict
|
||||
except Exception as e:
|
||||
@ -162,10 +163,10 @@ def find_forbidden(qualification=""):
|
||||
|
||||
#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。
|
||||
if __name__ == '__main__':
|
||||
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json"
|
||||
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
|
||||
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json"
|
||||
# clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
|
||||
qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
|
||||
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f"
|
||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx'
|
||||
res = find_forbidden(truncate_json_path, clause_path, qualification)
|
||||
res = find_forbidden(qualification)
|
||||
print(json.dumps(res, ensure_ascii=False, indent=4))
|
Loading…
x
Reference in New Issue
Block a user