zbparse/flask_app/main/投标人须知正文提取指定内容.py
2024-11-01 14:28:10 +08:00

136 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt
def find_keys_by_value(target_value, json_data):
    """Return the keys of ``json_data`` whose value matches ``target_value``.

    Exact matches take priority: when at least one value equals
    ``target_value``, only those keys are returned.  Otherwise every key
    whose string value starts with ``target_value`` is returned instead.

    Args:
        target_value: Heading text to look for.
        json_data: Mapping of clause keys to their values.

    Returns:
        list: Matching keys in the mapping's iteration order; empty when
        nothing matches either way.
    """
    exact_hits = [name for name, content in json_data.items() if content == target_value]
    if exact_hits:
        return exact_hits

    # No exact hit: fall back to string values that merely begin with the target.
    fuzzy_hits = []
    for name, content in json_data.items():
        if isinstance(content, str) and content.startswith(target_value):
            fuzzy_hits.append(name)
    return fuzzy_hits
def find_keys_with_prefix(key_prefix, json_data):
    """Return the keys of ``json_data`` that belong to section ``key_prefix``.

    A key belongs to the section when it starts with ``key_prefix``; for
    example the prefix ``"3.1"`` selects ``"3.1"``, ``"3.1.1"`` and
    ``"3.1.2"``.  When the prefix ends in a digit, a key whose next character
    is also a digit carries a different section number (``"3.10"`` is not
    part of ``"3.1"``) and is therefore skipped.

    Args:
        key_prefix: Section number to look for.  A prefix that does not end
            in a digit (such as ``"3."``) is matched as a plain string prefix.
        json_data: Mapping, or any iterable of string keys, to scan.

    Returns:
        list: Matching keys in the iteration order of ``json_data``.
    """
    prefix_len = len(key_prefix)
    # Slicing keeps this safe for an empty prefix ('' is not a digit).
    ends_in_digit = key_prefix[-1:].isdigit()

    subheadings = []
    for key in json_data:
        if not key.startswith(key_prefix):
            continue
        # "3.1" followed by another digit is "3.10", "3.11", ... - a sibling
        # section, not a sub-heading of "3.1".
        if ends_in_digit and key[prefix_len:prefix_len + 1].isdigit():
            continue
        subheadings.append(key)
    return subheadings
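# Pull the requested sections (e.g. 投标, 评标) out of the full clause JSON.
# The commented-out code that follows is a superseded draft of extract_json, kept for reference.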
# def extract_json(data, target_values):
# results = {}
#
# # 遍历所有目标值
# for target_value in target_values:
# # 找到所有与目标值匹配的键
# matched_keys = find_keys_by_value(target_value, data)
#
# for key in matched_keys:
# # 查找所有以该键为前缀的子键,限制只提取直接子项
# key_and_subheadings = find_keys_with_prefix(key, data)
#
# for subkey in key_and_subheadings:
# # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
# if "." in subkey:
# parent_key = subkey.rsplit('.', 1)[0]
# top_level_key = parent_key.split('.')[0] + '.'
#
# # 确保顶级键不会重复添加
# if top_level_key not in results:
# results[top_level_key] = data[top_level_key]
#
# # 添加或更新父级键
# if parent_key not in results:
# if parent_key in data:
# results[parent_key] = data[parent_key]
#
# # 添加当前子键和它的值
# if subkey in data:
# results[subkey] = data[subkey]
#
# return results
def extract_json(data, target_values):
    """Collect the clauses of ``data`` that fall under the given headings.

    For each heading in ``target_values`` the matching keys are located with
    ``find_keys_by_value``; every key sharing one of those prefixes (per
    ``find_keys_with_prefix``) is then copied into the result together with
    its parent key and a top-level entry.

    Args:
        data: Mapping of clause numbers (e.g. ``"3."``, ``"3.1"``) to values.
        target_values: Heading texts to search for, in priority order.

    Returns:
        dict: Selected key/value pairs in the order they were encountered.
        Keys that contain no ``"."`` are ignored.
    """
    collected = {}
    for wanted in target_values:
        for hit in find_keys_by_value(wanted, data):
            for name in find_keys_with_prefix(hit, data):
                if "." not in name:
                    continue

                parent = name.rsplit('.', 1)[0]
                chapter = parent.split('.')[0] + '.'

                # The first heading that reaches a chapter labels it with the
                # searched text rather than with data[chapter]; later headings
                # leave that label untouched.
                collected.setdefault(chapter, wanted)

                # Copy the parent only once, and only if it really exists.
                if parent not in collected and parent in data:
                    collected[parent] = data[parent]

                # The clause itself always takes its value from data (this
                # overwrites the label above when name is the chapter key).
                collected[name] = data[name]
    return collected
def sort_clean_data_keys(data):
    """Strip whitespace from the keys of ``data`` and order them numerically.

    Each key is compared by the tuple of integers it contains, so ``"3.2"``
    sorts before ``"3.10"``.  Keys without any digits compare as the empty
    tuple and therefore come first.  If two keys become identical once
    whitespace is removed, the later value wins.

    Args:
        data: Mapping whose keys are clause numbers such as ``"3. 1"``.

    Returns:
        dict: New dictionary with cleaned keys in ascending numeric order.
    """
    # Step 1: rebuild the mapping with whitespace-free keys.
    cleaned = {}
    for raw_key, value in data.items():
        cleaned[re.sub(r'\s+', '', raw_key)] = value

    # Step 2: sort by the runs of digits found in each key.
    def numeric_parts(key):
        return tuple(int(digits) for digits in re.findall(r'\d+', key))

    ordered_keys = list(cleaned)
    ordered_keys.sort(key=numeric_parts)

    # Step 3: emit the pairs in sorted order.
    return {key: cleaned[key] for key in ordered_keys}
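# NOTE(review): the description below presumably belongs to process_nested_data
# (imported at the top of this file) - confirm.
#
# Recursively processes nested data structures (dicts and lists).
# Applies the post_process function to the innermost string values.
# post_process tries to split long strings into chunks following specific
# patterns, each chunk holding at least 50 Chinese/English characters.
# If every value in the dict is "/" or an empty list, a list of '' is returned.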
def extract_from_notice(invalid_path, clause_path, type):
    """Extract one category of requirements from a clause JSON file.

    The file at ``clause_path`` is loaded, filtered with ``extract_json``,
    key-sorted with ``sort_clean_data_keys`` and then passed through
    ``transform_json`` and ``process_nested_data``.  When that pipeline
    yields an empty result, ``get_requirements_with_gpt`` is called instead.

    Args:
        invalid_path: Forwarded to ``get_requirements_with_gpt`` for the
            fallback; not used otherwise.
        clause_path: Path of the UTF-8 encoded clause JSON file.
        type: Category selector - 1 (bidding / bid documents), 2 (bid
            opening, evaluation, award), 3 (re-tendering / termination),
            4 (evaluation only; marked as a test value in the original).

    Returns:
        The processed result, or the fallback's result when it is empty.

    Raises:
        ValueError: If ``type`` is not one of the selectors above.
    """
    # Headings searched for each selector; scanned with == like the original chain.
    heading_table = (
        (1, ["投标","投标文件","响应文件"]),
        (2, ["开标", "评标", "定标","磋商程序","中标"]),
        (3, ["重新招标、不再招标和终止招标","重新招标","重新采购", "不再招标", "不再采购","终止招标","终止采购"]),
        (4, ["评标"]),
    )
    for selector, headings in heading_table:
        if type == selector:
            target_values = headings
            break
    else:
        raise ValueError(
            "Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")

    with open(clause_path, 'r', encoding='utf-8') as clause_file:
        clauses = json.load(clause_file)

    # Filter -> clean and sort keys -> reshape -> post-process.
    selected = extract_json(clauses, target_values)
    ordered = sort_clean_data_keys(selected)
    reshaped = transform_json(ordered)
    final_result = process_nested_data(reshaped)

    # Nothing usable in the clause file: fall back to the GPT-based extractor.
    if not final_result:
        final_result = get_requirements_with_gpt(invalid_path, type)
    return final_result
if __name__ == "__main__":
    # Manual smoke test: run the extraction against a local clause file and
    # pretty-print the result.
    # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
    file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\clause1.json"
    # extract_from_notice takes (invalid_path, clause_path, type).  The first
    # argument is only forwarded to get_requirements_with_gpt when the clause
    # file yields nothing; set it to the real document before relying on that
    # fallback.
    invalid_path = ""
    try:
        res = extract_from_notice(invalid_path, file_path, 2)  # change the type argument to try other scenarios
        res2 = json.dumps(res, ensure_ascii=False, indent=4)
        print(res2)
    except ValueError as e:
        print(e)