def _section_sort_key(key):
    """Build a sort key that orders clause numbers numerically.

    Each dot-separated component becomes a tuple. Components that parse as
    integers sort by value (so "2." precedes "10."); any other component
    sorts after all numeric ones, by its text, instead of raising.
    """
    parts = []
    for part in key.strip('.').split('.'):
        try:
            parts.append((0, int(part), ""))
        except ValueError:
            # Non-numeric or empty component: keep it sortable rather than
            # aborting the whole extraction.
            parts.append((1, 0, part))
    return parts


# Generate unstructured data (engineering bids)
def extract_sections(data, target_values):
    """
    Collect the sub-entries of every section whose title is in target_values.

    A key of ``data`` is treated as a section header when its value is one of
    ``target_values``. Every other key that lies strictly below that header
    (i.e. starts with the header key followed by a dot boundary) is emitted
    as the string "<sub_key> <sub_value>", in numeric clause order.

    Sections titled "定标" or "中标" are not stored under their own name;
    their sub-entries are accumulated together under the single key
    "定标与中标" whenever at least one of them is present.

    Args:
        data (dict): Flat mapping of clause numbers (e.g. "5.", "5.1") to
            their text.
        target_values (list): Section titles to extract.

    Returns:
        dict: Section title -> list of "<sub_key> <sub_value>" strings.
    """
    sorted_keys = sorted(data, key=_section_sort_key)
    result = {}
    merged_sections = []

    for key in sorted_keys:
        section_name = data[key]
        if section_name not in target_values:
            continue

        # Match on a dot boundary so that section "1" does not swallow
        # "10." / "10.1", and "5.1" does not swallow "5.10".
        prefix = key if key.endswith('.') else key + '.'
        subitems = [
            f"{sub_key} {data[sub_key]}"
            for sub_key in sorted_keys
            if sub_key != key and sub_key.startswith(prefix)
        ]

        if section_name in ("定标", "中标"):
            merged_sections.extend(subitems)
        else:
            result[section_name] = subitems

    if merged_sections:
        result["定标与中标"] = merged_sections

    return result
final_result = get_requirements_with_gpt(invalid_path, type) - return final_result + # final_result = process_nested_data(transformed_data) + # if not final_result: + # final_result = get_requirements_with_gpt(invalid_path, type) + # return final_result if __name__ == "__main__": # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' - file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\clause1.json" + invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\ztbfile_invalid.pdf" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\clause1.json" try: - res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景 + res = extract_from_notice(invalid_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 res2 = json.dumps(res, ensure_ascii=False, indent=4) print(res2) except ValueError as e: diff --git a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py index 49bb387..84f99fd 100644 --- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py +++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py @@ -107,11 +107,11 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type): final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型 return final_result # print(json.dumps(extracted_data,ensure_ascii=False,indent=4)) - extracted_data_concatenated = {section: concatenate_keys_values(content) + extracted_data_concatenated = {section: concatenate_keys_values(content) #启用结构化就注释这三行 for section, content in extracted_data.items()} return extracted_data_concatenated - # transformed_data = process_with_outer_key(extracted_data) + # transformed_data = process_with_outer_key(extracted_data) #取消注释这三行 # final_result = process_nested_data(transformed_data) # return final_result