diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 69624b4..30d53f1 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -149,22 +149,35 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): parts = current_section_number.split('.') matched_patterns = [pattern] # start with the full pattern - # Next section at same level - parts[-1] = str(int(parts[-1]) + 1) - # next_pattern = r'^' + r'\.\s*'.join(parts) #11.12以前版本 - next_pattern = r'^' + r'\s*[..]\s*'.join(parts) - matched_patterns.append(next_pattern) + # for i in range(1, 6): #同级,与matched_patterns = [pattern]重复了,故注释 + # # 复制 parts 列表以避免修改原列表 + # new_parts = parts.copy() + # new_parts[-1] = str(int(new_parts[-1]) + i) + # # 使用不同的分隔符 + # next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts) + # matched_patterns.append(next_pattern) # Parent section (if applicable) if len(parts) > 1: - parent_section_parts = parts[:-1] - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts) - matched_patterns.append(parent_pattern) + for i in range(1, 6): #考虑原文档的书写不规范,跳序号的情况,目前设置了范围<5 + parent_section_parts = parts[:-1].copy() + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i) + parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) # 添加对 '数字 、' 格式的支持 digit_comma_pattern = r'^\d+\s*、' matched_patterns.append(digit_comma_pattern) + + # 获取当前顶级章节编号 + current_top_level_num = int(current_section_number.split('.')[0]) + for i in range(1, 6): + next_top_level_num = current_top_level_num + i + next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..]' + # 检查是否已经包含了该模式,避免重复 + if next_top_level_pattern not in matched_patterns: + matched_patterns.append(next_top_level_pattern) + # Combine the patterns combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' current_section_pattern = re.compile(combined_pattern) diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py index b7cdf04..5088bdf 100644 --- a/flask_app/main/禁止投标情形.py +++ b/flask_app/main/禁止投标情形.py @@ -151,9 +151,10 @@ def find_forbidden(qualification=""): actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"] includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] excludes = ["招标", "评标", "定标"] - forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) - processed_results = extract_unique_items_from_texts(forbidden_results) - merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) + # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) + # processed_results = extract_unique_items_from_texts(forbidden_results) + # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) + merged_forbidden_list = list(dict.fromkeys(actual_list)) forbidden_dict = {'不得存在的其他情形': merged_forbidden_list} return forbidden_dict except Exception as e: @@ -162,10 +163,10 @@ def find_forbidden(qualification=""): #TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。 if __name__ == '__main__': - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" - clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" + # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f" doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx' - res = find_forbidden(truncate_json_path, clause_path, qualification) + res = find_forbidden(qualification) print(json.dumps(res, ensure_ascii=False, indent=4)) \ No newline at end of file