2024-11-21 16:22:22 +08:00
|
|
|
|
import json
|
2024-11-26 15:06:57 +08:00
|
|
|
|
import re
|
2024-11-22 16:33:00 +08:00
|
|
|
|
|
2024-11-21 16:22:22 +08:00
|
|
|
|
|
2024-11-26 15:06:57 +08:00
|
|
|
|
def insert_missing_commas(json_str):
    """Insert commas that are missing between a string value and the next key.

    The function looks for a ``"key": "value"`` member whose closing quote is
    followed only by optional whitespace and then the opening quote of the
    next key, and inserts ``", "`` between the two quotes (any whitespace that
    separated them is dropped). Escaped characters inside the value, such as
    an escaped double quote, are skipped as a unit so they are never mistaken
    for the end of the value.

    Only string values are handled; a missing comma after a number, literal,
    object or array is left untouched.

    Args:
        json_str: JSON-like text that may lack commas between members.

    Returns:
        The text with the missing commas inserted, or the input unchanged
        when nothing matched.
    """
    # Group 1: '":' + optional whitespace + one complete string value, whose
    # body is a run of ordinary characters or backslash escapes.
    # Group 2: the opening quote of the following key.
    pattern = re.compile(r'(":\s*"(?:[^"\\]|\\.)*")\s*(")')
    replacement = r'\1, \2'

    # Re-run until a pass changes nothing, so gaps that a previous pass
    # could not see (matches never overlap within one pass) are also fixed.
    previous_str = None
    while previous_str != json_str:
        previous_str = json_str
        json_str = pattern.sub(replacement, json_str)

    return json_str
|
2024-11-25 10:50:01 +08:00
|
|
|
|
|
2024-11-25 11:20:06 +08:00
|
|
|
|
|
2024-11-26 15:06:57 +08:00
|
|
|
|
def fix_json_escape_sequences(json_str):
    r"""Double every backslash that does not start a legal JSON escape.

    JSON allows only ``\"``, ``\\``, ``\/``, ``\b``, ``\f``, ``\n``, ``\r``,
    ``\t`` and ``\u`` followed by exactly four hex digits. Any other
    backslash (for example the one in ``\geq`` or ``\underline``) is replaced
    by two backslashes so that the text can be parsed.

    Legal escapes are consumed as a whole, so the second backslash of an
    already-escaped ``\\`` pair is never inspected on its own and valid input
    is returned unchanged.

    Note that sequences such as ``\times`` or ``\frac`` begin with a legal
    escape (``\t``, ``\f``) and are therefore left as they are.

    Args:
        json_str: JSON-like text that may contain illegal escape sequences.

    Returns:
        The text with every illegal backslash doubled.
    """
    # Alternative 1 (captured) is a complete legal escape; alternative 2 is
    # any remaining backslash, which by elimination is illegal.
    pattern = re.compile(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\')

    def _repair(match):
        if match.group(1) is not None:
            return match.group(0)  # legal escape: keep verbatim
        return '\\\\'  # stray backslash: emit an escaped backslash

    return pattern.sub(_repair, json_str)
|
2024-11-25 11:20:06 +08:00
|
|
|
|
|
2024-11-26 15:06:57 +08:00
|
|
|
|
|
|
|
|
|
def replace_latex_expressions(json_str):
    r"""Replace LaTeX-style ``$...$`` expressions with plain Unicode text.

    For every span enclosed in a pair of dollar signs (on a single line) the
    dollar signs are removed, known LaTeX commands are replaced by their
    Unicode symbol, and a number followed by one of the units ``m``, ``s``,
    ``kg`` or ``A`` gets the unit replaced by its Chinese name.

    Example: ``$三 \geq 2 m$`` becomes ``三 ≥ 2米``.

    Commands are matched as whole tokens, so an unknown command such as
    ``\pitchfork`` is left alone instead of being partially rewritten, and a
    unit is only converted when it is not the start of a longer word
    (``5 mm`` and ``3 sin`` are left alone).

    Args:
        json_str: Text that may contain ``$...$`` expressions.

    Returns:
        The text with every ``$...$`` span converted; text outside such
        spans is returned unchanged.
    """
    # LaTeX command -> Unicode symbol.
    latex_mapping = {
        r'\geq': '≥',
        r'\leq': '≤',
        r'\times': '×',
        r'\frac': '/',  # crude handling of fractions
        r'\neq': '≠',
        r'\approx': '≈',
        r'\pm': '±',
        r'\alpha': 'α',
        r'\beta': 'β',
        r'\gamma': 'γ',
        r'\delta': 'δ',
        r'\pi': 'π',
        r'\sqrt': '√',
        r'\infty': '∞',
        r'\cup': '∪',
        r'\cap': '∩',
        r'\subseteq': '⊆',
        r'\supseteq': '⊇',
        r'\forall': '∀',
        r'\exists': '∃',
        r'\rightarrow': '→',
        r'\leftarrow': '←',
        # add further mappings here as needed
    }

    # Unit symbol -> Chinese unit name; add further units here as needed.
    unit_mapping = {
        'm': '米',
        's': '秒',
        'kg': '公斤',
        'A': '安',  # ampere
    }

    # A command is a backslash plus its full run of letters.
    command_pattern = re.compile(r'\\[A-Za-z]+')
    # digits, optional whitespace, a unit, and no letter right after it.
    unit_pattern = re.compile(r'(\d+)\s*(kg|m|s|A)(?![A-Za-z])')

    def replace_command(match):
        # Unknown commands are kept verbatim.
        return latex_mapping.get(match.group(0), match.group(0))

    def replace_unit(match):
        return match.group(1) + unit_mapping[match.group(2)]

    def replace_match(match):
        expr = match.group(1)
        expr = command_pattern.sub(replace_command, expr)
        expr = unit_pattern.sub(replace_unit, expr)
        return expr

    # Convert everything enclosed in $...$ (non-greedy, so each pair of
    # dollar signs is handled separately).
    fixed_str = re.sub(r'\$(.*?)\$', replace_match, json_str)
    return fixed_str
|
2024-11-25 11:20:06 +08:00
|
|
|
|
|
2024-11-26 15:06:57 +08:00
|
|
|
|
|
|
|
|
|
def extract_content_from_json(string):
    """Extract and parse the JSON object embedded in ``string``.

    Steps:
    1. Take the text from the first ``{`` to the last ``}`` and try to parse
       it as-is.
    2. If that fails, try each repair on the original text, in order:
       missing-comma insertion, LaTeX expression replacement, illegal-escape
       repair.
    3. As a last resort apply all three repairs together, so input that is
       damaged in more than one way can still be recovered.

    Progress and failure messages are printed to standard output.

    Args:
        string: Text expected to contain a JSON object. ``None``, non-string
            values and blank strings are accepted.

    Returns:
        The parsed object, or an empty dict when the input is not a
        non-blank string, contains no ``{...}`` span, or cannot be parsed
        even after the repairs.
    """
    if not isinstance(string, str) or not string.strip():
        return {}

    # Greedy match: spans from the first "{" to the last "}" in the text.
    match = re.search(r'\{[\s\S]*\}', string)
    if not match:
        print("未找到有效的 JSON 内容。")
        return {}

    original_json = match.group(0)

    # First attempt: the text exactly as found.
    try:
        parsed = json.loads(original_json)
    except json.JSONDecodeError as original_error:
        print(f"直接解析原始 JSON 失败: {original_error}")
    else:
        print("直接解析原始 JSON 成功。")
        return parsed

    def _apply_all_repairs(text):
        # LaTeX first, so known commands become symbols instead of doubled
        # backslashes; commas last, on text whose escapes are already legal.
        text = replace_latex_expressions(text)
        text = fix_json_escape_sequences(text)
        return insert_missing_commas(text)

    # (repair function, message on success, label used on failure)
    repairs = (
        (insert_missing_commas,
         "使用方法1:逗号修复成功。",
         "方法1(逗号修复)解析失败"),
        (replace_latex_expressions,
         "使用方法2:LaTeX 表达式替换成功。",
         "方法2(LaTeX 替换)解析失败"),
        (fix_json_escape_sequences,
         "使用方法3:非法转义序列修复成功。",
         "方法3(非法转义修复)解析失败"),
        (_apply_all_repairs,
         "使用方法4:组合修复成功。",
         "方法4(组合修复)解析失败"),
    )

    # Every repair starts from the original text, never from the output of
    # a previous failed attempt.
    for repair, success_message, failure_label in repairs:
        try:
            parsed = json.loads(repair(original_json))
        except json.JSONDecodeError as error:
            print(f"{failure_label}: {error}")
        else:
            print(success_message)
            return parsed

    # Every attempt failed.
    print("所有修复方法均失败,返回空字典。")
    return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def replace_latex_expressions_in_dict(obj):
    """Recursively replace LaTeX expressions in a parsed JSON structure.

    Dict values and list items are processed recursively; dict keys are left
    as they are. Every string is passed to ``replace_latex_expressions``,
    which converts the spans enclosed in ``$...$``. Any other value is
    returned unchanged.

    Args:
        obj: A dict, list, string or any other value.

    Returns:
        A new dict or list with the same shape, the converted string, or
        ``obj`` itself for other types.
    """
    if isinstance(obj, dict):
        return {k: replace_latex_expressions_in_dict(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [replace_latex_expressions_in_dict(item) for item in obj]
    if isinstance(obj, str):
        # Hand over the whole string: replace_latex_expressions locates the
        # $...$ spans itself. Passing it only the text between the dollar
        # signs would give it nothing to match, leaving the LaTeX untouched.
        return replace_latex_expressions(obj)
    return obj
|
|
|
|
|
# Sample payload for a manual check. After Python unescaping, the text holds
# single-backslash LaTeX commands (\times, \geq) inside $...$ spans.
test_json_1 = """
{
    "示例": "速度为 $v = 3 \\times 10^2 m/s$,加速度为 $a \\geq 9.8 m/s^2$。"
}
"""

if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, so that
    # importing the module neither parses the sample nor prints anything.
    res = extract_content_from_json(test_json_1)
    print(res)
|