From 53f1b0fc3567179ca6a5621743e545139cc51353 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 18 Feb 2025 09:29:34 +0800 Subject: [PATCH] =?UTF-8?q?2.17=20=E5=B7=B2=E8=A7=A3=E5=86=B3=E5=86=85?= =?UTF-8?q?=E5=AD=98=E6=B3=84=E6=BC=8F=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 +++- flask_app/general/读取文件/按页读取pdf.py | 4 +- flask_app/test_case/find_zbfile.py | 39 ------------ flask_app/test_case/test_内存泄漏.py | 60 +++++++++++++++++++ .../{test_轮询.py => test_轮询负载均衡.py} | 0 5 files changed, 69 insertions(+), 44 deletions(-) delete mode 100644 flask_app/test_case/find_zbfile.py create mode 100644 flask_app/test_case/test_内存泄漏.py rename flask_app/test_case/{test_轮询.py => test_轮询负载均衡.py} (100%) diff --git a/README.md b/README.md index 74c7a76..fd3641a 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,9 @@ requirements.txt一般无需变动,除非代码中使用了新的库,也要 2. .env环境配好 (一般不需要在电脑环境变量中额外配置了,但是要在Pycharm中安装插件,使得项目能将env中的环境变量配置到系统环境变量中!!!) 3. 点击下拉框,Edit configurations - ![1](C:\Users\Administrator\Desktop\md_files\11.png) + ![1](md_files/11.png) -​ 设置run_serve.py为启动脚本![1](C:\Users\Administrator\Desktop\md_files\10.png) +​ 设置run_serve.py为启动脚本![1](md_files/10.png) ​ 注意这里的working directory要设置到最外层文件夹,而不是flask_app!!! 4. postman打post请求测试: @@ -85,6 +85,8 @@ bid-assistance/test 里面找个文件的url,推荐'094定稿-湖北工业大 1. 编写shell文件,sudo vim clean_dir.sh 命名为clean_dir.sh +清理/home/Z/zbparse_output_dev下的output1这些二级目录下的c8d2140d-9e9a-4a49-9a30-b53ba565db56这种uuid的三级目录(只保留最近7天)。 + ``` #!/bin/bash @@ -120,7 +122,7 @@ sudo chmod +x ./clean_dir.sh sudo ./clean_dir.sh ``` -4. 每天零点清理 +4. 
以 root 用户的身份编辑 crontab 文件,从而设置或修改系统定时任务(cron jobs)。每天零点10分清理 ``` sudo crontab -e @@ -129,6 +131,8 @@ sudo crontab -e 10 0 * * * /home/Z/clean_dir.sh ``` +**目前测试服务器和正式服务器都写上了!无需变动** + ## flask_app结构介绍 diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 888d994..2076306 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -26,8 +26,8 @@ def extract_text_by_page_fitz(file_path): def extract_text_by_page(file_path): - common_header="" - # common_header = extract_common_header(file_path) + # common_header="" + common_header = extract_common_header(file_path) # print(common_header) result = "" with open(file_path, 'rb') as file: diff --git a/flask_app/test_case/find_zbfile.py b/flask_app/test_case/find_zbfile.py deleted file mode 100644 index c209b28..0000000 --- a/flask_app/test_case/find_zbfile.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -import shutil - -def find_and_copy_files(input_folder, output_folder): - # 确保输出文件夹存在 - os.makedirs(output_folder, exist_ok=True) - - # 定义支持的文件格式 - supported_formats = ('.pdf', '.doc', '.docx') - - # 遍历输入文件夹 - for root, dirs, files in os.walk(input_folder): - for file in files: - # 检查文件名是否包含“招标”或“竞争性”并且文件格式正确 - if ('响应' not in file and '投标' not in file) and \ - ('竞争性' in file or '招标文件' in file or '磋商' in file) and \ - file.endswith(supported_formats): - # 构造完整的文件路径 - file_path = os.path.join(root, file) - # 构造输出路径 - output_path = os.path.join(output_folder, file) - - # 检查输出路径是否存在同名文件 - unique_path = output_path - count = 1 - while os.path.exists(unique_path): - # 文件名前加上编号,格式如:filename(1).ext - base, extension = os.path.splitext(output_path) - unique_path = f"{base}({count}){extension}" - count += 1 - - # 复制文件到唯一的路径 - shutil.copy2(file_path, unique_path) - print(f"Copied '{file}' to '{unique_path}'.") - -# 使用示例 -input_folder = 'Z:\\货物类tb\\投标项目资料' # 输入文件夹路径 -output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles' # 输出文件夹路径 
-find_and_copy_files(input_folder, output_folder)
diff --git a/flask_app/test_case/test_内存泄漏.py b/flask_app/test_case/test_内存泄漏.py
new file mode 100644
index 0000000..144be9a
--- /dev/null
+++ b/flask_app/test_case/test_内存泄漏.py
@@ -0,0 +1,60 @@
+# test_内存泄漏.py -- manual memory check: runs the parsing pipeline on sample PDFs and prints memory_usage() before/after each run
+from memory_profiler import memory_usage
+import time
+from flask_app.general.读取文件.按页读取pdf import read_pdf_main
+from flask_app.general.通用功能函数 import get_global_logger
+from flask_app.routes.货物标解析main import preprocess_file_main
+
+def process_data():
+    # Simulate a memory-hungry operation and return the number of elements built
+    data = [i for i in range(1000000)]  # builds a list of 1,000,000 ints (the original comment claimed 10 million)
+    time.sleep(1)  # pretend to be busy: sleeps 1 second (the original comment claimed 2 seconds)
+    return len(data)
+
+def call():  # runs process_data() but discards its result; always returns 1
+    a = 1
+    process_data()
+    return a
+
+def process_pdf(pdf_path):
+    """Call read_pdf_main on pdf_path and return its result (original note: the function actually executed in the child process)."""
+    result = read_pdf_main(pdf_path=pdf_path)
+    return result
+def main():  # three iterations, one per hard-coded sample PDF, printing memory before and after each
+    for i in range(3):
+        logger = get_global_logger("123")
+        pdf_path1=r'C:\Users\Administrator\Desktop\工程标\招标test文件夹\zbtest13.pdf'
+        pdf_path2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp\ztbfile.pdf'
+        pdf_path3= r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp\ztbfile.pdf'
+        output_folder1 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'
+        output_folder2 = r'C:\Users\Administrator\Desktop\fsdownload\4fb2f541-29c3-497f-9f0a-5216a10591a1\tmp'  # NOTE(review): identical to output_folder1 -- confirm this is intended
+        output_folder3=r'C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\tmp'
+        mem_before = memory_usage()[0]  # first value returned by memory_usage(), taken before this iteration's work
+        if i % 3 == 0:  # choose the output folder / PDF pair for this iteration
+            output_folder = output_folder1
+            pdf_path = pdf_path1
+        elif i %3 ==1:
+            output_folder = output_folder2
+            pdf_path = pdf_path2
+        else:
+            output_folder = output_folder3
+            pdf_path = pdf_path3
+        result=preprocess_file_main(output_folder, pdf_path, 2,logger)  # indexed by the *_path keys below; meaning of the literal 2 is not visible here -- TODO confirm
+        process_pdf(pdf_path)  # return value is discarded
+        response_data = {
+            'result': {
+                'procurement_spec': result['procurement_path'],
+                'evaluation_method': result['evaluation_method_path'],
+                'qualification_docs': result['qualification_path'],
+                'notice_docs': result['notice_path'],
+                'clause_details': result['clause_path'],
+                'merged_baseinfo': result['merged_baseinfo_path']
+            }
+        }
+        print(response_data)
+        mem_after = memory_usage()[0]  # same measurement, taken after this iteration's work
+        print(f"Memory before: {mem_before} MiB, Memory after: {mem_after} MiB")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/flask_app/test_case/test_轮询.py b/flask_app/test_case/test_轮询负载均衡.py
similarity index 100%
rename from flask_app/test_case/test_轮询.py
rename to flask_app/test_case/test_轮询负载均衡.py