2.17 已解决内存泄漏问题

2025-02-18 09:29:34 +08:00 · 2025-02-18 09:29:34 +08:00 · 53f1b0fc35
commit 53f1b0fc35
parent 5f13665739
5 changed files with 69 additions and 44 deletions
--- a/README.md
+++ b/README.md
@ -55,9 +55,9 @@ requirements.txt一般无需变动，除非代码中使用了新的库，也要
 2. 配置好 .env 文件（一般无需再在系统环境变量中额外配置，但需要在 PyCharm 中安装插件，使项目能够把 .env 中的变量加载为环境变量！！！）
 3. 点击下拉框，Edit configurations

- ![1](C:\Users\Administrator\Desktop\md_files\11.png)
+ ![1](md_files/11.png)

-	设置run_serve.py为启动脚本![1](C:\Users\Administrator\Desktop\md_files\10.png)
+	设置run_serve.py为启动脚本![1](md_files/10.png)
 	注意这里的working directory要设置到最外层文件夹，而不是flask_app！！！

 4. postman打post请求测试：
@ -85,6 +85,8 @@ bid-assistance/test 里面找个文件的url，推荐'094定稿-湖北工业大
 1. 编写shell文件，sudo vim clean_dir.sh
   命名为clean_dir.sh

+该脚本用于清理 /home/Z/zbparse_output_dev 下各二级目录（如 output1）中以 UUID 命名的三级目录（如 c8d2140d-9e9a-4a49-9a30-b53ba565db56），只保留最近 7 天的目录。
+
 ```
 #!/bin/bash

@ -120,7 +122,7 @@ sudo chmod +x ./clean_dir.sh
 sudo ./clean_dir.sh
 ```

-4. 每天零点清理
+4. 以 root 用户身份编辑 crontab 文件，添加定时任务（cron job），使清理脚本在每天 0 点 10 分执行

 ```
 sudo crontab -e
@ -129,6 +131,8 @@ sudo crontab -e
 10 0 * * * /home/Z/clean_dir.sh
 ```

+**目前测试服务器和正式服务器均已配置好该定时任务，无需改动。**
+


 ## flask_app结构介绍
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -26,8 +26,8 @@ def extract_text_by_page_fitz(file_path):


 def extract_text_by_page(file_path):
-    common_header=""
-    # common_header = extract_common_header(file_path)
+    # common_header=""
+    common_header = extract_common_header(file_path)
    # print(common_header)
    result = ""
    with open(file_path, 'rb') as file:
--- a/flask_app/test_case/find_zbfile.py
+++ b/flask_app/test_case/find_zbfile.py
@ -1,39 +0,0 @@
-import os
-import shutil
-
-def find_and_copy_files(input_folder, output_folder):
-    # Walk input_folder recursively and copy every file whose name passes the
-    # keyword/extension filter below into the single flat output_folder.
-    # Make sure the output folder exists
-    os.makedirs(output_folder, exist_ok=True)
-
-    # Supported file extensions
-    supported_formats = ('.pdf', '.doc', '.docx')
-
-    # Walk the input folder tree
-    for root, dirs, files in os.walk(input_folder):
-        for file in files:
-            # Keep a file only if its name contains neither '响应' nor '投标',
-            # contains at least one of '竞争性' / '招标文件' / '磋商',
-            # and ends with a supported extension (endswith is case-sensitive)
-            if ('响应' not in file and '投标' not in file) and \
-                    ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
-                    file.endswith(supported_formats):
-                # Full path of the source file
-                file_path = os.path.join(root, file)
-                # Destination path; sub-directory structure is not preserved
-                output_path = os.path.join(output_folder, file)
-
-                # Avoid overwriting an existing file with the same name
-                unique_path = output_path
-                count = 1
-                while os.path.exists(unique_path):
-                    # Insert a counter before the extension, e.g. filename(1).ext
-                    base, extension = os.path.splitext(output_path)
-                    unique_path = f"{base}({count}){extension}"
-                    count += 1
-
-                # Copy the file to the collision-free path
-                shutil.copy2(file_path, unique_path)
-                print(f"Copied '{file}' to '{unique_path}'.")
-
-# Example usage (executed at import time; this file has no __main__ guard)
-input_folder = 'Z:\\货物类tb\\投标项目资料'  # input folder path
-output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles'  # output folder path
-find_and_copy_files(input_folder, output_folder)
--- a/flask_app/test_case/test_内存泄漏.py
+++ b/flask_app/test_case/test_内存泄漏.py
@ -0,0 +1,60 @@
+# test_内存泄漏.py
+from memory_profiler import memory_usage
+import time
+from flask_app.general.读取文件.按页读取pdf import read_pdf_main
+from flask_app.general.通用功能函数 import get_global_logger
+from flask_app.routes.货物标解析main import preprocess_file_main
+
+def process_data():
+    # Simulate a memory-hungry operation and return the number of elements built
+    data = [i for i in range(1000000)]  # build a list of 1,000,000 integers
+    time.sleep(1)  # pretend to work for 1 second
+    return len(data)
+
+def call():
+    # Call process_data(), discard its result, and always return 1
+    a = 1
+    process_data()  # the returned list length is ignored
+    return a
+
+def process_pdf(pdf_path):
+    """Read the PDF at pdf_path with read_pdf_main and return its result."""
+    # NOTE(review): the original docstring said this runs inside a child process,
+    # but main() in this file calls it directly — confirm whether a subprocess
+    # wrapper is still intended.
+    result = read_pdf_main(pdf_path=pdf_path)
+    return result
+def main():
+    # Run the preprocessing pipeline three times, once per sample PDF, and print
+    # the memory reading taken before and after each run so growth between
+    # iterations can be compared by eye.
+    for i in range(3):
+        logger = get_global_logger("123")
+        # Hard-coded local sample files; adjust these paths before running elsewhere
+        pdf_path1=r'C:\Users\Administrator\Desktop\工程标\招标test文件夹\zbtest13.pdf'
+        pdf_path2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp\ztbfile.pdf'
+        pdf_path3=  r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp\ztbfile.pdf'
+        # NOTE(review): output_folder1 and output_folder2 are the same directory
+        output_folder1 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'
+        output_folder2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'
+        output_folder3=r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp'
+        # First sample returned by memory_profiler.memory_usage(); printed below as MiB
+        mem_before = memory_usage()[0]
+        # Pick the i-th (pdf, output folder) pair
+        if i % 3 == 0:
+            output_folder = output_folder1
+            pdf_path = pdf_path1
+        elif i %3 ==1:
+            output_folder = output_folder2
+            pdf_path = pdf_path2
+        else:
+            output_folder = output_folder3
+            pdf_path = pdf_path3
+        # NOTE(review): the meaning of the positional argument 2 is not visible here —
+        # confirm against preprocess_file_main's signature
+        result=preprocess_file_main(output_folder, pdf_path, 2,logger)
+        process_pdf(pdf_path)  # return value is discarded; called only to exercise the PDF reader
+        # Re-key the paths returned by preprocess_file_main into a response-shaped dict
+        response_data = {
+            'result': {
+                'procurement_spec': result['procurement_path'],
+                'evaluation_method': result['evaluation_method_path'],
+                'qualification_docs': result['qualification_path'],
+                'notice_docs': result['notice_path'],
+                'clause_details': result['clause_path'],
+                'merged_baseinfo': result['merged_baseinfo_path']
+            }
+        }
+        print(response_data)
+        mem_after = memory_usage()[0]
+        print(f"Memory before: {mem_before} MiB, Memory after: {mem_after} MiB")
+
+
+if __name__ == '__main__':  # run the memory check only when executed as a script
+    main()
--- a/flask_app/test_case/test_轮询负载均衡.py
+++ b/flask_app/test_case/test_轮询负载均衡.py