From b25592da4d421703376a7bc547294cfd8ce3b11f Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 18 Feb 2025 15:45:07 +0800 Subject: [PATCH] =?UTF-8?q?2.18=20=E7=A9=BA=E7=9A=84docx=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E8=A2=AB=E8=AF=86=E5=88=AB=E4=B8=BA=E9=9D=9E=E6=8B=9B=E6=A0=87?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/llm/大模型通用函数.py | 3 +- flask_app/general/llm/通义千问long.py | 4 +-- flask_app/general/merge_pdfs.py | 2 -- .../general/投标人须知正文提取指定内容.py | 2 +- flask_app/logger_setup.py | 8 ++--- flask_app/test_case/test_内存泄漏.py | 1 + flask_app/test_case/test_内存泄漏2.py | 30 +++++++++++++++++++ 7 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 flask_app/test_case/test_内存泄漏2.py diff --git a/flask_app/general/llm/大模型通用函数.py b/flask_app/general/llm/大模型通用函数.py index e83c99c..5dd4b64 100644 --- a/flask_app/general/llm/大模型通用函数.py +++ b/flask_app/general/llm/大模型通用函数.py @@ -160,8 +160,7 @@ def generate_full_user_query(file_path, prompt_template): return user_query if __name__ == "__main__": - error_msg='''Error code: 400 - {'error': {'code': 'RequestTimeOut', 'param': None, 'message': 'Request timed out, please try again later.', 'type': 'RequestTimeOut'}, 'id': 'chatcmpl-a92698d3-4e21-9243-b900-e32d4df4ac49', 'request_id': 'a92698d3-4e21-9243-b900-e32d4df4ac49'} - ''' + error_msg='''openai.BadRequestError: Error code: 400 - {'error': {'code': 'invalid_parameter_error', 'param': None, 'message': 'File parsing in progress, please try again later.', 'type': 'invalid_request_error'}, 'id': 'chatcmpl-018e2b26-b3eb-907e-be6a-6ba8fccee86f', 'request_id': '018e2b26-b3eb-907e-be6a-6ba8fccee86f'}''' error_code, error_code_string, request_id=extract_error_details(error_msg) print(error_code) print(error_code_string) diff --git a/flask_app/general/llm/通义千问long.py b/flask_app/general/llm/通义千问long.py index a59d745..836f8d3 100644 --- a/flask_app/general/llm/通义千问long.py +++ b/flask_app/general/llm/通义千问long.py @@ -99,7 +99,7 @@ def qianwen_long(file_id, user_query, max_retries=2, backoff_factor=1.0, need_ex break elif error_code == 400 and error_code_string in [ 'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed', - 'response_timeout', 'request_timeout', "RequestTimeOut" + 'response_timeout', 'request_timeout', "RequestTimeOut","invalid_parameter_error" ]: logger.warning(f"错误代码为 400 - {error_code_string},将调用 qianwen_long_stream 执行一次...") try: @@ -219,7 +219,7 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0, break elif error_code == 400 and error_code_string in [ 'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed', - 'response_timeout', 'request_timeout', "RequestTimeOut" + 'response_timeout', 'request_timeout', "RequestTimeOut","invalid_parameter_error" ]: if attempt == 1: # 只重试一次 logger.warning(f"错误代码为 400 - {error_code_string},将立即重试...") diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index e8712ea..b103b29 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -200,8 +200,6 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na if matching_files: matching_files_sorted = sorted(matching_files) all_pdfs_to_merge.extend(matching_files_sorted) - for f in matching_files_sorted: - print(f"选中文件: {f}") else: if required: print(f"没有找到以 '{suffix}' 结尾的文件。") diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index 1ce7881..9ff6511 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -308,7 +308,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection): } """, 3: """ - 该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。 + 该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该基于实际的招标文件内容,不得擅自总结删减,更不得回答示例输出中的内容,如果原文中未提及相关内容,在相应键值中填'未知'。 示例输出如下,仅供格式参考: { "重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;", diff --git a/flask_app/logger_setup.py b/flask_app/logger_setup.py index ba5c3e3..4d41753 100644 --- a/flask_app/logger_setup.py +++ b/flask_app/logger_setup.py @@ -66,10 +66,10 @@ def create_logger(app, subfolder): file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(logging.Formatter('%(message)s')) - logger.addHandler(stream_handler) - logger.setLevel(logging.INFO) + # stream_handler = logging.StreamHandler() + # stream_handler.setFormatter(logging.Formatter('%(message)s')) + # logger.addHandler(stream_handler) + logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。 logger.propagate = False g.logger = logger g.output_folder = output_folder #输出文件夹路径 diff --git a/flask_app/test_case/test_内存泄漏.py b/flask_app/test_case/test_内存泄漏.py index 144be9a..f6225b2 100644 --- a/flask_app/test_case/test_内存泄漏.py +++ b/flask_app/test_case/test_内存泄漏.py @@ -20,6 +20,7 @@ def process_pdf(pdf_path): """子进程里实际执行的函数""" result = read_pdf_main(pdf_path=pdf_path) return result + def main(): for i in range(3): logger = get_global_logger("123") diff --git a/flask_app/test_case/test_内存泄漏2.py b/flask_app/test_case/test_内存泄漏2.py new file mode 100644 index 0000000..fa59fa2 --- /dev/null +++ b/flask_app/test_case/test_内存泄漏2.py @@ -0,0 +1,30 @@ +import tracemalloc +from PyPDF2 import PdfReader + +def extract_text_by_page(file_path): + result = "" + with open(file_path, 'rb') as file: + reader =PdfReader(file) + num_pages = len(reader.pages) + # print(f"Total pages: {num_pages}") + for page_num in range(num_pages): + page = reader.pages[page_num] + text = page.extract_text() + return result + +# 开始跟踪内存分配 +tracemalloc.start() +# 捕捉函数调用前的内存快照 +snapshot_before = tracemalloc.take_snapshot() +# 调用函数 +file_path=r'C:\Users\Administrator\Desktop\fsdownload\00550cfc-fd33-469e-8272-9215291b175c\ztbfile.pdf' +result = extract_text_by_page(file_path) +# 捕捉函数调用后的内存快照 +snapshot_after = tracemalloc.take_snapshot() +# 比较两个快照,获取内存分配差异信息 +stats = snapshot_after.compare_to(snapshot_before, 'lineno') +print("[ Top 10 内存变化 ]") +for stat in stats[:10]: + print(stat) +# 停止内存分配跟踪 +tracemalloc.stop()