diff --git a/flask_app/general/doubao.py b/flask_app/general/doubao.py index 30491d2..fe837dc 100644 --- a/flask_app/general/doubao.py +++ b/flask_app/general/doubao.py @@ -2,6 +2,8 @@ import os import PyPDF2 import requests +from ratelimit import sleep_and_retry, limits + from flask_app.general.file2markdown import convert_pdf_to_markdown from flask_app.general.clean_pdf import extract_common_header, clean_page_content @@ -58,7 +60,8 @@ def read_txt_to_string(file_path): except Exception as e: return f"错误:读取文件时发生错误。详细信息:{e}" - +@sleep_and_retry +@limits(calls=10, period=1) # 每秒最多调用10次 def doubao_model(full_user_query): print("call doubao...") # 相关参数 diff --git a/flask_app/货物标/技术参数要求提取后处理函数.py b/flask_app/货物标/技术参数要求提取后处理函数.py index 4b72567..17ed4f2 100644 --- a/flask_app/货物标/技术参数要求提取后处理函数.py +++ b/flask_app/货物标/技术参数要求提取后处理函数.py @@ -187,12 +187,25 @@ def restructure_data(data): def get_prefixes(s): prefixes = [] for i in range(len(s)): - if s[i] == ':': + if s[i] in [':', ':']: prefixes.append(s[:i+1]) return prefixes # 定义删除公共前缀的函数 -def remove_common_prefixes(string_list): +def remove_common_prefixes(string_list, min_occurrence=3): + """ + 删除列表中所有满足出现次数>= min_occurrence 的公共前缀。 + + Args: + string_list (list): 字符串列表。 + min_occurrence (int): 前缀至少出现的次数。 + + Returns: + list: 删除公共前缀后的字符串列表。 + """ + if not string_list: + return string_list + # 构建前缀到字符串集合的映射 prefix_to_strings = {} for s in string_list: @@ -202,20 +215,31 @@ def remove_common_prefixes(string_list): if prefix not in prefix_to_strings: prefix_to_strings[prefix] = set() prefix_to_strings[prefix].add(s) - # 找出至少在两个字符串中出现的前缀 - prefixes_occuring_in_multiple_strings = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >=2] - # 对每个字符串,找到其匹配的最长前缀并删除 + + # 找出所有出现次数 >= min_occurrence 的前缀 + qualifying_prefixes = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >= min_occurrence] + + if not qualifying_prefixes: + # 没有满足条件的公共前缀,返回原列表 + return string_list + + # 
为了确保较长的前缀先被匹配,按长度降序排序 + qualifying_prefixes.sort(key=len, reverse=True) + + # 对每个字符串,循环删除所有匹配的前缀 new_string_list = [] for s in string_list: - applicable_prefixes = [prefix for prefix in prefixes_occuring_in_multiple_strings if s.startswith(prefix)] - if applicable_prefixes: - # 找到最长的前缀 - longest_prefix = max(applicable_prefixes, key=len) - # 删除前缀 - new_s = s[len(longest_prefix):] - new_string_list.append(new_s) - else: - new_string_list.append(s) + original_s = s + changed = True + while changed: + changed = False + for prefix in qualifying_prefixes: + if s.startswith(prefix): + s = s[len(prefix):] + changed = True + # 一旦删除一个前缀,重新开始检查,以处理可能的多个前缀 + break + new_string_list.append(s) return new_string_list if __name__ == "__main__":