This commit is contained in:
zy123 2024-12-04 15:32:42 +08:00
parent d8f7718511
commit 6698c37235
2 changed files with 42 additions and 15 deletions

View File

@ -2,6 +2,8 @@ import os
import PyPDF2
import requests
from ratelimit import sleep_and_retry, limits
from flask_app.general.file2markdown import convert_pdf_to_markdown
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
@ -58,7 +60,8 @@ def read_txt_to_string(file_path):
except Exception as e:
return f"错误:读取文件时发生错误。详细信息:{e}"
@sleep_and_retry
@limits(calls=10, period=1)  # at most 10 calls per second
def doubao_model(full_user_query):
print("call doubao...")
# 相关参数

View File

@ -187,12 +187,25 @@ def restructure_data(data):
def get_prefixes(s):
    """Return every prefix of ``s`` that ends on a delimiter character.

    The string is scanned left to right; each time a delimiter is met, the
    slice from the start of the string up to and including that delimiter
    is recorded.

    Args:
        s: The string to scan.

    Returns:
        A list of prefixes ordered from shortest to longest. Empty when
        ``s`` is empty or contains no delimiter.
    """
    # NOTE(review): the first delimiter is an empty string in this view; an
    # empty string can never equal a single character, so only the full-width
    # colon actually matches. The literal may have lost a character during
    # rendering -- confirm against the repository before relying on it.
    delimiters = ('', ':')
    return [s[:end + 1] for end, ch in enumerate(s) if ch in delimiters]
# 定义删除公共前缀的函数
def remove_common_prefixes(string_list):
def remove_common_prefixes(string_list, min_occurrence=3):
"""
删除列表中所有满足出现次数>= min_occurrence 的公共前缀
Args:
string_list (list): 字符串列表
min_occurrence (int): 前缀至少出现的次数
Returns:
list: 删除公共前缀后的字符串列表
"""
if not string_list:
return string_list
# 构建前缀到字符串集合的映射
prefix_to_strings = {}
for s in string_list:
@ -202,20 +215,31 @@ def remove_common_prefixes(string_list):
if prefix not in prefix_to_strings:
prefix_to_strings[prefix] = set()
prefix_to_strings[prefix].add(s)
# 找出至少在两个字符串中出现的前缀
prefixes_occuring_in_multiple_strings = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >=2]
# 对每个字符串,找到其匹配的最长前缀并删除
# 找出所有出现次数 >= min_occurrence 的前缀
qualifying_prefixes = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >= min_occurrence]
if not qualifying_prefixes:
# 没有满足条件的公共前缀,返回原列表
return string_list
# 为了确保较长的前缀先被匹配,按长度降序排序
qualifying_prefixes.sort(key=len, reverse=True)
# 对每个字符串,循环删除所有匹配的前缀
new_string_list = []
for s in string_list:
applicable_prefixes = [prefix for prefix in prefixes_occuring_in_multiple_strings if s.startswith(prefix)]
if applicable_prefixes:
# 找到最长的前缀
longest_prefix = max(applicable_prefixes, key=len)
# 删除前缀
new_s = s[len(longest_prefix):]
new_string_list.append(new_s)
else:
new_string_list.append(s)
original_s = s
changed = True
while changed:
changed = False
for prefix in qualifying_prefixes:
if s.startswith(prefix):
s = s[len(prefix):]
changed = True
# 一旦删除一个前缀,重新开始检查,以处理可能的多个前缀
break
new_string_list.append(s)
return new_string_list
if __name__ == "__main__":