# Source: zbparse/flask_app/main/投标人须知正文提取指定内容.py
# Repository viewer metadata: 121 lines, 4.8 KiB, Python
# Last change: 2024-08-29 16:37:09 +08:00
import json
import re
def find_keys_by_value(target_value, json_data):
    """Return the keys of ``json_data`` whose value matches ``target_value``.

    Exact matches take priority. Only when no value equals ``target_value``
    does the search fall back to string values that start with it.

    Args:
        target_value: The value to look for.
        json_data: Mapping of keys to values to search.

    Returns:
        A list of matching keys in the mapping's iteration order; empty when
        nothing matches.
    """
    exact_hits = []
    for name, content in json_data.items():
        if content == target_value:
            exact_hits.append(name)
    if exact_hits:
        return exact_hits

    # No exact hit: accept any string value that begins with the target.
    prefix_hits = []
    for name, content in json_data.items():
        if isinstance(content, str) and content.startswith(target_value):
            prefix_hits.append(name)
    return prefix_hits
def find_keys_with_prefix(key_prefix, json_data):
    """Return every key of ``json_data`` that begins with ``key_prefix``.

    This is a plain string-prefix test: the prefix itself is included when it
    is a key, and a prefix such as "3.1" also matches "3.10".

    Args:
        key_prefix: Leading text the keys must start with.
        json_data: Mapping (or other iterable of string keys) to scan.

    Returns:
        A list of the matching keys in iteration order.
    """
    matches = []
    for name in json_data:
        if name.startswith(key_prefix):
            matches.append(name)
    return matches
def _continues_number(key, candidate):
    """Return True when ``candidate`` only extends the last number of ``key``.

    ``candidate`` is expected to start with ``key``. For key "3.1" the
    candidate "3.10" continues the trailing "1" into a different clause
    number, whereas "3.1" itself and "3.1.2" do not.
    """
    following = candidate[len(key):len(key) + 1]
    return key[-1:].isdigit() and following.isdigit()


def extract_json(data, target_values):
    """Collect the clauses whose content matches any of ``target_values``.

    For every target value the matching keys are located with
    ``find_keys_by_value``; each matched key and the keys sharing its prefix
    (``find_keys_with_prefix``) are copied into the result. Prefix matches
    that merely extend the final number of the matched key (e.g. "3.10" for
    the key "3.1") are sibling clauses, not subheadings, and are skipped.

    For a key containing a dot, its direct parent (the text before the last
    dot) is copied as well when it exists in ``data`` and is not yet in the
    result. For the target "定标" the top-level key ("N.") is seeded with the
    literal "定标" when it is not present yet.

    Args:
        data: Flat mapping of clause keys to clause text.
        target_values: Iterable of values to search for, processed in order.

    Returns:
        A dict holding the selected entries of ``data`` in insertion order.
    """
    results = {}
    for target_value in target_values:
        for key in find_keys_by_value(target_value, data):
            for subkey in find_keys_with_prefix(key, data):
                if _continues_number(key, subkey):
                    # "3.1" must not drag in "3.10", "3.11", ...
                    continue
                if "." in subkey:
                    parent_key = subkey.rsplit('.', 1)[0]
                    top_level_key = parent_key.split('.')[0] + '.'
                    # Make sure the "定标" section has a top-level entry
                    # without overwriting one that is already present.
                    if target_value == "定标" and top_level_key not in results:
                        results[top_level_key] = "定标"
                    # Bring in the direct parent once, if the data has it.
                    if parent_key not in results and parent_key in data:
                        results[parent_key] = data[parent_key]
                # NOTE(review): the original indentation was lost; the current
                # key is stored for every subkey, dotted or not — confirm.
                results[subkey] = data[subkey]
    return results
def transform_json(data):
    """Rebuild a flat, number-keyed mapping into a nested structure.

    Each key is matched against up to three dot-separated numbers at its
    start; keys that do not start with a digit are skipped. Entries of depth
    one and two open a container named after the first whitespace-delimited
    word of their text, and depth-three entries are stored as text inside the
    current depth-two container. A depth-two text containing a newline is
    stored directly as ``first line -> remainder`` and opens no container.

    After building, a single-element list is replaced by its element and an
    empty list stored in a dict is replaced by an empty string.

    Args:
        data: Mapping of clause numbers (e.g. "3.", "3.1", "3.1.2") to text.

    Returns:
        The nested structure described above.

    Raises:
        KeyError: If an entry appears before any entry of its parent depth.
        IndexError: If a text that is used as a name has no non-blank word.
    """
    numbering = re.compile(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?')
    tree = {}
    # containers[d] is the object that receives entries of depth d + 1.
    containers = {0: tree}

    def trailing_dict(items):
        """Return the last element of ``items``, appending a dict if needed."""
        if len(items) == 0 or not isinstance(items[-1], dict):
            items.append({})
        return items[-1]

    for number, text in data.items():
        found = numbering.match(number)
        if not found:
            continue

        parts = found.groups()
        levels = [int(part) for part in parts if part is not None]
        depth = len(levels)
        holder = containers[depth - 1]

        if depth == len(parts):
            # Deepest supported level: keep the text itself.
            if isinstance(holder, list):
                holder.append(text)
            else:
                # Name the entry after the first whitespace-delimited word.
                holder[text.split()[0]] = text
            continue

        title = text.split()[0]

        if '\n' in text and depth == 2:
            # First line is the name, the remainder is the content.
            heading, body = text.split('\n', 1)
            heading = heading.strip()
            body = body.strip()
            # Entries inside a list live in its trailing dict.
            if isinstance(holder, list):
                trailing_dict(holder)[heading] = body
            else:
                holder[heading] = body
            continue

        if isinstance(holder, list):
            holder = trailing_dict(holder)
        if title not in holder:
            holder[title] = []
        containers[depth] = holder[title]

    def collapse(node):
        """Unwrap single-element lists and blank out empty lists in dicts."""
        if isinstance(node, dict):
            for name in list(node):
                cleaned = collapse(node[name])
                if isinstance(cleaned, list) and not cleaned:
                    cleaned = ""  # an empty list becomes an empty string
                node[name] = cleaned
            return node
        if isinstance(node, list) and len(node) == 1:
            return collapse(node[0])
        return node

    return collapse(tree)
def extract_from_notice(clause_path, type):
    """Load a clause JSON file, extract one topic group and nest the result.

    Args:
        clause_path: Path to a UTF-8 encoded JSON file holding a flat mapping
            of clause keys to clause text.
        type: Selects the group of target values to extract:
            1 -> "投标文件", "投标";
            2 -> "开标", "评标", "定标";
            3 -> "重新招标、不再招标和终止招标", "重新招标", "不再招标",
            "终止招标".

    Returns:
        The structure produced by ``transform_json`` from the entries that
        ``extract_json`` selected.

    Raises:
        ValueError: If ``type`` is not 1, 2 or 3. The check happens before
            the file is opened.
    """
    targets_by_type = {
        1: ["投标文件", "投标"],
        2: ["开标", "评标", "定标"],
        3: ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"],
    }
    try:
        target_values = targets_by_type[type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are equally invalid.
        raise ValueError(
            "Invalid type specified. Use 1 for '投标文件, 投标', "
            "2 for '开标, 评标, 定标' or "
            "3 for '重新招标、不再招标和终止招标'."
        ) from None

    with open(clause_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    extracted_data = extract_json(data, target_values)
    return transform_json(extracted_data)
# Manual check: run the extraction against a sample clause file.
if __name__ == "__main__":
    file_path = 'clause2.json'  # assumed location of the raw clause data
    try:
        # Change the second argument (1, 2 or 3) to try the other groups.
        print(extract_from_notice(file_path, 3))
    except ValueError as e:
        print(e)