12.4
This commit is contained in:
parent
d8f7718511
commit
6698c37235
@ -2,6 +2,8 @@ import os
|
||||
|
||||
import PyPDF2
|
||||
import requests
|
||||
from ratelimit import sleep_and_retry, limits
|
||||
|
||||
from flask_app.general.file2markdown import convert_pdf_to_markdown
|
||||
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
@ -58,7 +60,8 @@ def read_txt_to_string(file_path):
|
||||
except Exception as e:
|
||||
return f"错误:读取文件时发生错误。详细信息:{e}"
|
||||
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(calls=10, period=1) # at most 10 calls per second
|
||||
def doubao_model(full_user_query):
|
||||
print("call doubao...")
|
||||
# 相关参数
|
||||
|
@ -187,12 +187,25 @@ def restructure_data(data):
|
||||
def get_prefixes(s, delimiters=(':', '\uff1a')):
    """Return every prefix of ``s`` that ends at a colon delimiter.

    The string is scanned left to right; each time a delimiter character is
    found, the slice from the start of ``s`` up to and including that
    character is collected.

    Args:
        s (str): The string to scan.
        delimiters: Characters that terminate a prefix. Defaults to the
            ASCII colon and the full-width colon (U+FF1A).

    Returns:
        list: Prefixes in order of increasing length; empty when ``s``
        contains no delimiter.
    """
    # NOTE(review): the visible source listed the ASCII colon twice in its
    # membership check; the full-width colon is presumably what the second
    # entry was meant to be -- confirm against the repository.
    return [s[:i + 1] for i, ch in enumerate(s) if ch in delimiters]
|
||||
|
||||
# 定义删除公共前缀的函数
|
||||
def remove_common_prefixes(string_list):
|
||||
def remove_common_prefixes(string_list, min_occurrence=3):
|
||||
"""
|
||||
删除列表中所有满足出现次数>= min_occurrence 的公共前缀。
|
||||
|
||||
Args:
|
||||
string_list (list): 字符串列表。
|
||||
min_occurrence (int): 前缀至少出现的次数。
|
||||
|
||||
Returns:
|
||||
list: 删除公共前缀后的字符串列表。
|
||||
"""
|
||||
if not string_list:
|
||||
return string_list
|
||||
|
||||
# 构建前缀到字符串集合的映射
|
||||
prefix_to_strings = {}
|
||||
for s in string_list:
|
||||
@ -202,19 +215,30 @@ def remove_common_prefixes(string_list):
|
||||
if prefix not in prefix_to_strings:
|
||||
prefix_to_strings[prefix] = set()
|
||||
prefix_to_strings[prefix].add(s)
|
||||
# 找出至少在两个字符串中出现的前缀
|
||||
prefixes_occuring_in_multiple_strings = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >=2]
|
||||
# 对每个字符串,找到其匹配的最长前缀并删除
|
||||
|
||||
# 找出所有出现次数 >= min_occurrence 的前缀
|
||||
qualifying_prefixes = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >= min_occurrence]
|
||||
|
||||
if not qualifying_prefixes:
|
||||
# 没有满足条件的公共前缀,返回原列表
|
||||
return string_list
|
||||
|
||||
# 为了确保较长的前缀先被匹配,按长度降序排序
|
||||
qualifying_prefixes.sort(key=len, reverse=True)
|
||||
|
||||
# 对每个字符串,循环删除所有匹配的前缀
|
||||
new_string_list = []
|
||||
for s in string_list:
|
||||
applicable_prefixes = [prefix for prefix in prefixes_occuring_in_multiple_strings if s.startswith(prefix)]
|
||||
if applicable_prefixes:
|
||||
# 找到最长的前缀
|
||||
longest_prefix = max(applicable_prefixes, key=len)
|
||||
# 删除前缀
|
||||
new_s = s[len(longest_prefix):]
|
||||
new_string_list.append(new_s)
|
||||
else:
|
||||
original_s = s
|
||||
changed = True
|
||||
while changed:
|
||||
changed = False
|
||||
for prefix in qualifying_prefixes:
|
||||
if s.startswith(prefix):
|
||||
s = s[len(prefix):]
|
||||
changed = True
|
||||
# 一旦删除一个前缀,重新开始检查,以处理可能的多个前缀
|
||||
break
|
||||
new_string_list.append(s)
|
||||
return new_string_list
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user