2.17 已解决内存泄漏问题

This commit is contained in:
zy123 2025-02-18 09:29:34 +08:00
parent 5f13665739
commit 53f1b0fc35
5 changed files with 69 additions and 44 deletions

View File

@ -55,9 +55,9 @@ requirements.txt一般无需变动除非代码中使用了新的库也要
2. .env环境配好。一般不需要在电脑环境变量中额外配置了，但是要在PyCharm中安装插件，使得项目能将.env中的环境变量配置到系统环境变量中
3. 点击下拉框Edit configurations
![1](C:\Users\Administrator\Desktop\md_files\11.png)
![1](md_files/11.png)
设置run_serve.py为启动脚本![1](C:\Users\Administrator\Desktop\md_files\10.png)
设置run_serve.py为启动脚本![1](md_files/10.png)
注意：这里的working directory要设置到最外层文件夹，而不是flask_app
4. postman打post请求测试
@ -85,6 +85,8 @@ bid-assistance/test 里面找个文件的url推荐'094定稿-湖北工业大
1. 编写shell文件sudo vim clean_dir.sh
命名为clean_dir.sh
清理/home/Z/zbparse_output_dev下的output1这些二级目录下的、形如c8d2140d-9e9a-4a49-9a30-b53ba565db56这种uuid的三级目录，只保留最近7天
```
#!/bin/bash
@ -120,7 +122,7 @@ sudo chmod +x ./clean_dir.sh
sudo ./clean_dir.sh
```
4. 每天零点清理
4. 以 root 用户的身份编辑 crontab 文件，从而设置或修改系统定时任务（cron jobs）：每天零点10分清理
```
sudo crontab -e
@ -129,6 +131,8 @@ sudo crontab -e
10 0 * * * /home/Z/clean_dir.sh
```
**目前测试服务器和正式服务器都写上了!无需变动**
## flask_app结构介绍

View File

@ -26,8 +26,8 @@ def extract_text_by_page_fitz(file_path):
def extract_text_by_page(file_path):
common_header=""
# common_header = extract_common_header(file_path)
# common_header=""
common_header = extract_common_header(file_path)
# print(common_header)
result = ""
with open(file_path, 'rb') as file:

View File

@ -1,39 +0,0 @@
import os
import shutil
def find_and_copy_files(input_folder, output_folder):
    """Copy tender-document files from ``input_folder`` into ``output_folder``.

    The input tree is walked recursively. A file is copied when all of the
    following hold:

    * its name contains neither '响应' nor '投标';
    * its name contains at least one of '竞争性', '招标文件' or '磋商';
    * its extension is ``.pdf``, ``.doc`` or ``.docx`` (compared
      case-insensitively, so ``.PDF`` / ``.DOCX`` are accepted too).

    All matches are copied flat into ``output_folder``. When a file with the
    same name already exists there, a numeric suffix is inserted before the
    extension (``name(1).ext``, ``name(2).ext``, ...) so nothing is
    overwritten.

    Args:
        input_folder: Root directory to search.
        output_folder: Destination directory; created if it does not exist.

    Returns:
        None. One line is printed for every file copied.
    """
    # Make sure the output folder exists.
    os.makedirs(output_folder, exist_ok=True)

    # Supported file formats (lower-case; the file name is lowered before comparing).
    supported_formats = ('.pdf', '.doc', '.docx')

    # Walk the input folder.
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            # Skip response / bid documents.
            if '响应' in file or '投标' in file:
                continue
            # Keep only tender-style documents.
            if not ('竞争性' in file or '招标文件' in file or '磋商' in file):
                continue
            # Case-insensitive extension check so that e.g. "X.PDF" is not missed.
            if not file.lower().endswith(supported_formats):
                continue

            # Full path of the source file and its default destination.
            file_path = os.path.join(root, file)
            output_path = os.path.join(output_folder, file)

            # Split once; the pieces are reused for every collision candidate.
            base, extension = os.path.splitext(output_path)

            # Find a destination name that does not exist yet: filename(1).ext, ...
            unique_path = output_path
            count = 1
            while os.path.exists(unique_path):
                unique_path = f"{base}({count}){extension}"
                count += 1

            # Copy the file (with metadata) to the unique path.
            shutil.copy2(file_path, unique_path)
            print(f"Copied '{file}' to '{unique_path}'.")
# Usage example: runs at module level, so the copy happens whenever this file is executed.
input_folder = 'Z:\\货物类tb\\投标项目资料'  # Input folder path
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles'  # Output folder path
find_and_copy_files(input_folder=input_folder, output_folder=output_folder)

View File

@ -0,0 +1,60 @@
# test_内存泄漏.py
from memory_profiler import memory_usage
import time
from flask_app.general.读取文件.按页读取pdf import read_pdf_main
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.routes.货物标解析main import preprocess_file_main
def process_data(size=1000000, delay=1):
    """Simulate a memory-hungry workload and report how many items were built.

    A temporary list of ``size`` consecutive integers is created, kept alive
    while the function sleeps for ``delay`` seconds, and then measured.

    Args:
        size: Number of integers to materialise. Defaults to 1,000,000, the
            value that was previously hard-coded.
        delay: Seconds to sleep while the list is alive. Defaults to 1, the
            value that was previously hard-coded.

    Returns:
        The length of the temporary list (``size`` for non-negative sizes,
        0 otherwise).
    """
    # Simulate an operation that occupies memory.
    data = list(range(size))
    # Pretend to be processing for `delay` seconds.
    time.sleep(delay)
    return len(data)
def call():
    """Run ``process_data()`` (its result is discarded) and return the constant 1."""
    process_data()
    return 1
def process_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` with ``read_pdf_main`` and return its result.

    The original note describes this as the function actually executed inside
    a child process; within this file it is called directly from ``main``.
    """
    return read_pdf_main(pdf_path=pdf_path)
def main():
    """Run the preprocessing pipeline three times and print memory usage around each run.

    Each iteration obtains a logger, samples memory with ``memory_usage()``,
    runs ``preprocess_file_main`` and ``process_pdf`` on one of three sample
    PDFs, prints the resulting path dictionary, samples memory again and
    prints both readings.
    """
    pdf_path1 = r'C:\Users\Administrator\Desktop\工程标\招标test文件夹\zbtest13.pdf'
    pdf_path2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp\ztbfile.pdf'
    pdf_path3 = r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp\ztbfile.pdf'
    output_folder1 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'
    output_folder2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'
    output_folder3 = r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp'

    # (output_folder, pdf_path) pairs, selected by iteration index modulo 3.
    # NOTE(review): the first two pairs share the same output folder -- confirm this is intended.
    cases = (
        (output_folder1, pdf_path1),
        (output_folder2, pdf_path2),
        (output_folder3, pdf_path3),
    )

    # Response key -> key in the dict returned by preprocess_file_main.
    field_map = (
        ('procurement_spec', 'procurement_path'),
        ('evaluation_method', 'evaluation_method_path'),
        ('qualification_docs', 'qualification_path'),
        ('notice_docs', 'notice_path'),
        ('clause_details', 'clause_path'),
        ('merged_baseinfo', 'merged_baseinfo_path'),
    )

    for i in range(3):
        # The logger is fetched on every pass, exactly as before.
        logger = get_global_logger("123")
        mem_before = memory_usage()[0]

        output_folder, pdf_path = cases[i % 3]
        result = preprocess_file_main(output_folder, pdf_path, 2, logger)
        process_pdf(pdf_path)

        response_data = {
            'result': {out_key: result[src_key] for out_key, src_key in field_map}
        }
        print(response_data)

        mem_after = memory_usage()[0]
        print(f"Memory before: {mem_before} MiB, Memory after: {mem_after} MiB")


if __name__ == '__main__':
    main()