This commit is contained in:
zy123 2024-11-13 10:44:37 +08:00
parent 6e7da080bc
commit 4da7469e04
2 changed files with 29 additions and 15 deletions

View File

@ -149,22 +149,35 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
parts = current_section_number.split('.') parts = current_section_number.split('.')
matched_patterns = [pattern] # start with the full pattern matched_patterns = [pattern] # start with the full pattern
# Next section at same level # for i in range(1, 6): #同级与matched_patterns = [pattern]重复了,故注释
parts[-1] = str(int(parts[-1]) + 1) # # 复制 parts 列表以避免修改原列表
# next_pattern = r'^' + r'\.\s*'.join(parts) #11.12以前版本 # new_parts = parts.copy()
next_pattern = r'^' + r'\s*[.]\s*'.join(parts) # new_parts[-1] = str(int(new_parts[-1]) + i)
matched_patterns.append(next_pattern) # # 使用不同的分隔符
# next_pattern = r'^' + r'\s*[.]\s*'.join(new_parts)
# matched_patterns.append(next_pattern)
# Parent section (if applicable) # Parent section (if applicable)
if len(parts) > 1: if len(parts) > 1:
parent_section_parts = parts[:-1] for i in range(1, 6): #考虑原文档的书写不规范,跳序号的情况,目前设置了范围<=5
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) parent_section_parts = parts[:-1].copy()
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
parent_pattern = r'^' + r'\s*[.]\s*'.join(parent_section_parts) parent_pattern = r'^' + r'\s*[.]\s*'.join(parent_section_parts)
matched_patterns.append(parent_pattern) matched_patterns.append(parent_pattern)
# 添加对 '数字 、' 格式的支持 # 添加对 '数字 、' 格式的支持
digit_comma_pattern = r'^\d+\s*、' digit_comma_pattern = r'^\d+\s*、'
matched_patterns.append(digit_comma_pattern) matched_patterns.append(digit_comma_pattern)
# 获取当前顶级章节编号
current_top_level_num = int(current_section_number.split('.')[0])
for i in range(1, 6):
next_top_level_num = current_top_level_num + i
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.]'
# 检查是否已经包含了该模式,避免重复
if next_top_level_pattern not in matched_patterns:
matched_patterns.append(next_top_level_pattern)
# Combine the patterns # Combine the patterns
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
current_section_pattern = re.compile(combined_pattern) current_section_pattern = re.compile(combined_pattern)

View File

@ -151,9 +151,10 @@ def find_forbidden(qualification=""):
actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"] actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"]
includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
excludes = ["招标", "评标", "定标"] excludes = ["招标", "评标", "定标"]
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
processed_results = extract_unique_items_from_texts(forbidden_results) # processed_results = extract_unique_items_from_texts(forbidden_results)
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
merged_forbidden_list = list(dict.fromkeys(actual_list))
forbidden_dict = {'不得存在的其他情形': merged_forbidden_list} forbidden_dict = {'不得存在的其他情形': merged_forbidden_list}
return forbidden_dict return forbidden_dict
except Exception as e: except Exception as e:
@ -162,10 +163,10 @@ def find_forbidden(qualification=""):
#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。 #TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。
if __name__ == '__main__': if __name__ == '__main__':
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f" output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f"
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx' doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx'
res = find_forbidden(truncate_json_path, clause_path, qualification) res = find_forbidden(qualification)
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))