2024-09-11 12:02:09 +08:00
|
|
|
|
import json
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_and_format_from_paths(json_paths, includes, excludes):
    """Collect keyword-matching text from several JSON files.

    Each file is expected to hold a JSON object. Its top-level values are
    handled according to their type:

    * dict value: every nested value whose *key* contains one of
      ``includes`` is converted with ``str()`` and collected. ``excludes``
      is not consulted for nested entries.
    * str value: the first entry of ``includes`` that occurs in the string
      decides. The string is collected only if the text before that
      keyword's first occurrence contains none of ``excludes``. If the
      string contains a newline, only the part after the first newline is
      collected.
    * any other type is ignored.

    A missing file, a file with invalid JSON, or a file whose top-level
    value is not an object is reported on stdout and skipped, so the
    remaining paths are still processed.

    Args:
        json_paths (list): Paths of the JSON files to read (UTF-8).
        includes (list): Keywords to look for.
        excludes (list): Keywords that disqualify a string value when they
            appear before the matched include keyword.

    Returns:
        list: The collected strings, in path order and, within a file, in
        the order the keys appear in the JSON object.
    """
    all_formatted_results = []

    for path in json_paths:
        # Only opening and parsing can raise the errors handled here, so
        # the try body is limited to those two calls.
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")
            continue

        # json.load may also return a list, string or number; those have no
        # .items() and would otherwise abort the whole run.
        if not isinstance(json_data, dict):
            print(f"Error: The file '{path}' does not contain a JSON object.")
            continue

        for value in json_data.values():
            if isinstance(value, dict):
                # Nested object: select by key, keep the value as text.
                all_formatted_results.extend(
                    str(sub_value)
                    for sub_key, sub_value in value.items()
                    if any(include in sub_key for include in includes)
                )
            elif isinstance(value, str):
                # NOTE(review): the original indentation was lost. This
                # assumes the `break` belonged to the keyword match, i.e.
                # only the first include keyword found in the string is
                # evaluated against the excludes - confirm.
                matched = next(
                    (include for include in includes if include in value),
                    None,
                )
                if matched is None:
                    continue

                prefix = value.partition(matched)[0]
                if any(exclude in prefix for exclude in excludes):
                    continue

                # Keep only the text after the first newline; a string
                # without a newline is kept whole.
                all_formatted_results.append(value.split('\n', 1)[-1])

    return all_formatted_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_unique_items_from_texts(texts):
    """Split numbered texts into cleaned, de-duplicated items.

    For every input string the function:

    1. removes tab and newline characters;
    2. swaps each URL for a ``{URLn}`` placeholder so that the colon in
       ``http://`` is not mistaken for the end of a lead-in;
    3. drops the lead-in, i.e. everything up to and including the first
       ASCII or full-width colon;
    4. splits the rest on list markers (``1.``, ``(1)``, full-width
       ``(1)``, ``1)`` and the circled numbers 1 to 12);
    5. strips trailing punctuation from each piece and puts the URLs back.

    A piece is kept only if it is longer than 3 characters, contains at
    least two consecutive CJK/ASCII-alphanumeric characters, and has not
    been produced before (across all input strings).

    Args:
        texts: Iterable of strings to split.

    Returns:
        list: The kept items in order of first appearance.
    """
    # List markers. \uff08/\uff09 are the full-width parentheses and
    # \u2460-\u246b are the circled numbers 1-12.
    pattern = re.compile(
        r'(?:\d+\.|\(\d+\)|\uff08\d+\uff09|\d+\)|[\u2460-\u246b])\s*'
    )
    # Lead-in up to the first colon (ASCII or full-width \uff1a).
    intro_pattern = re.compile(r'^.*?[:\uff1a]')
    # Trailing punctuation: ASCII ; , . : ! ? plus the ideographic full stop
    # (\u3002), ideographic comma (\u3001) and the full-width forms of
    # ; , . : ! ?
    punctuation_pattern = re.compile(
        r'[;,.:!?\u3002\u3001\uff1b\uff0c\uff0e\uff1a\uff01\uff1f]+$'
    )
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # At least two consecutive CJK characters, letters or digits.
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')

    all_results = []
    seen = set()

    for text in texts:
        text = text.replace('\t', '').replace('\n', '')

        # Protect URLs before stripping the lead-in; otherwise a text with
        # no earlier colon would be cut right after "http:".
        urls = []

        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"

        text = url_pattern.sub(url_replacer, text)
        text = intro_pattern.sub('', text)

        for item in pattern.split(text):
            cleaned_item = item.strip()
            if not cleaned_item:
                continue

            cleaned_item = pattern.sub('', cleaned_item)
            cleaned_item = punctuation_pattern.sub('', cleaned_item)
            cleaned_item = cleaned_item.strip()

            # Restore the original URLs in place of their placeholders.
            for i, url in enumerate(urls, 1):
                cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

            if (
                len(cleaned_item) > 3
                and cleaned_item not in seen
                and content_check_pattern.search(cleaned_item)
            ):
                seen.add(cleaned_item)
                all_results.append(cleaned_item)

    return all_results
|
|
|
|
|
# Example run of the two functions defined above.
# NOTE(review): these statements execute at import time and the paths are
# specific to one Windows machine - consider a `__main__` guard and
# configurable paths if this module is ever imported elsewhere.
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
json_paths = [truncate_json_path, clause_path]  # adjust to where the files are actually stored
includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
excludes = ["招标", "评标", "定标"]

# Gather the matching passages, then split them into unique items.
results = extract_and_format_from_paths(json_paths, includes, excludes)
print(results)
res = extract_unique_items_from_texts(results)
print(res)
|