Merge branch 'develop-test' into develop-2.16

This commit is contained in:
zy123 2025-02-19 16:39:51 +08:00
commit 16bab601ca
7 changed files with 80 additions and 42 deletions

View File

@ -25,7 +25,7 @@ def extract_text_by_page_fitz(file_path):
return result return result
def extract_text_by_page(file_path): def extract_text_by_page_pypdf2(file_path):
# common_header="" # common_header=""
common_header = extract_common_header(file_path) common_header = extract_common_header(file_path)
# print(common_header) # print(common_header)
@ -109,7 +109,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
""" """
try: try:
# 提取文本内容 # 提取文本内容
extracted_text = extract_text_by_page(pdf_path) extracted_text = extract_text_by_page_pypdf2(pdf_path)
# 将提取的文本写入TXT文件 # 将提取的文本写入TXT文件
with open(txt_path, 'w', encoding='utf-8') as txt_file: with open(txt_path, 'w', encoding='utf-8') as txt_file:
@ -203,7 +203,7 @@ if __name__ == '__main__':
ress = extract_common_header(pdf_path) ress = extract_common_header(pdf_path)
print(ress) print(ress)
print("-----------------") print("-----------------")
extract_text_by_page(pdf_path) extract_text_by_page_pypdf2(pdf_path)
# res=extract_text_by_page_fitz(pdf_path) # res=extract_text_by_page_fitz(pdf_path)
# print(res)磋商文件_tobidders_notice_part2.pdf # print(res)磋商文件_tobidders_notice_part2.pdf
# save_extracted_text_to_txt(file_path,"output.txt") # save_extracted_text_to_txt(file_path,"output.txt")

View File

@ -66,9 +66,9 @@ def create_logger(app, subfolder):
file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter) file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler) logger.addHandler(file_handler)
# stream_handler = logging.StreamHandler() stream_handler = logging.StreamHandler()
# stream_handler.setFormatter(logging.Formatter('%(message)s')) stream_handler.setFormatter(logging.Formatter('%(message)s'))
# logger.addHandler(stream_handler) logger.addHandler(stream_handler)
logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。 logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。
logger.propagate = False logger.propagate = False
g.logger = logger g.logger = logger

View File

@ -1,9 +1,9 @@
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page from flask_app.general.读取文件.按页读取pdf import extract_text_by_page_pypdf2
def check_strings_in_pdf(file_path): def check_strings_in_pdf(file_path):
judge_list=['施工机械设备', '企业信息登记'] judge_list=['施工机械设备', '企业信息登记']
# Read text from PDF # Read text from PDF
text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF text = extract_text_by_page_pypdf2(file_path) # Assuming this returns all text from the PDF
full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text
# Initialize the questions list # Initialize the questions list

View File

@ -1,9 +1,10 @@
import multiprocessing
import os import os
import threading import threading
import time import time
from enum import Enum from enum import Enum
from typing import Any from typing import Any
from flask import Blueprint, g from flask import Blueprint, g, current_app
from flask_app.general.format_change import download_file from flask_app.general.format_change import download_file
from flask_app.routes.判断是否是招标文件 import judge_zbfile_exec from flask_app.routes.判断是否是招标文件 import judge_zbfile_exec
from flask_app.routes.utils import validate_and_setup_logger, create_response_normal, log_error_unique_id from flask_app.routes.utils import validate_and_setup_logger, create_response_normal, log_error_unique_id
@ -26,7 +27,18 @@ def judge_zbfile() -> Any: #判断是否是招标文件
unique_id = g.unique_id unique_id = g.unique_id
result = [None] # 用于存储结果的可变对象 result = [None] # 用于存储结果的可变对象
done = threading.Event() # 标志判断是否完成 done = threading.Event() # 标志判断是否完成
pool = current_app.process_pool # 使用全局的进程池
def judge_zbfile_exec_sub(file_path):
    """Run ``judge_zbfile_exec`` on *file_path* inside the shared process pool.

    ``pool`` is taken from the enclosing scope. ``pool.apply`` blocks the
    calling thread until the worker has finished and hands back whatever
    ``judge_zbfile_exec`` returned.
    """
    return pool.apply(judge_zbfile_exec, (file_path,))
def wrapper() -> None: def wrapper() -> None:
""" """
包装整个 judge_zbfile 的函数逻辑 包装整个 judge_zbfile 的函数逻辑
@ -38,7 +50,6 @@ def judge_zbfile() -> Any: #判断是否是招标文件
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True) downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
if not downloaded_filepath or file_type == 4: if not downloaded_filepath or file_type == 4:
logger.error("下载地址不存在或不支持的文件类型!")
log_error_unique_id(unique_id, 4) log_error_unique_id(unique_id, 4)
result[0] = JudgeResult.ERROR result[0] = JudgeResult.ERROR
return return
@ -46,7 +57,7 @@ def judge_zbfile() -> Any: #判断是否是招标文件
logger.info(f"Local file path: {downloaded_filepath}") logger.info(f"Local file path: {downloaded_filepath}")
# 调用实际的判断函数 # 调用实际的判断函数
judge_result = judge_zbfile_exec(downloaded_filepath) judge_result = judge_zbfile_exec_sub(downloaded_filepath)
judge = JudgeResult.YES if judge_result else JudgeResult.NO judge = JudgeResult.YES if judge_result else JudgeResult.NO
end_time = time.time() end_time = time.time()

View File

@ -1,7 +1,7 @@
import time import time
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2 from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
from docx import Document from docx import Document
import fitz
from flask_app.general.llm.通义千问long import upload_file, qianwen_long from flask_app.general.llm.通义千问long import upload_file, qianwen_long
def judge_zbfile_exec(file_path): def judge_zbfile_exec(file_path):
@ -12,13 +12,22 @@ def judge_zbfile_exec(file_path):
start_time = time.time() start_time = time.time()
# 检查文件是否为PDF格式 # 检查文件是否为PDF格式
if file_path.lower().endswith('.pdf'): if file_path.lower().endswith('.pdf'):
# 使用 with 语句确保文件关闭 try:
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
reader = PdfReader(f) reader = PdfReader(f)
num_pages = len(reader.pages) num_pages = len(reader.pages)
except Exception:
try:
doc = fitz.open(file_path)
num_pages = len(doc)
except Exception:
print("PDF 文件读取失败")
return False # 两种解析方式都失败,直接返回 False
if num_pages <= 5: if num_pages <= 5:
return False return False # 小于等于 5 页的 PDF 直接判定为非招标文件
elif file_path.lower().endswith('.docx'): elif file_path.lower().endswith('.docx'):
try:
doc = Document(file_path) doc = Document(file_path)
accumulated_text = "" accumulated_text = ""
chunk_size = 10 # 每次读取10个段落 chunk_size = 10 # 每次读取10个段落
@ -28,12 +37,15 @@ def judge_zbfile_exec(file_path):
chunk = paragraphs[i:i + chunk_size] chunk = paragraphs[i:i + chunk_size]
for para in chunk: for para in chunk:
accumulated_text += para.text accumulated_text += para.text
# 判断累计字符数是否已经达到1000字,
if len(accumulated_text) >= 1000: if len(accumulated_text) >= 1000:
break break # 读取超过1000字后即可停止
# 若累计内容不足1000字则直接返回False
if len(accumulated_text) < 1000: if len(accumulated_text) < 1000:
return False return False # 若累计内容不足1000字则直接返回 False
except Exception:
print("DOCX 文件读取失败,可能为乱码文件")
return False # 解析失败直接返回 False
pre_endtime=time.time()
print(f"judge_zbfile_exec预处理耗时{pre_endtime - start_time:.2f}")
# 使用大模型进行判断 # 使用大模型进行判断
user_query = """该文件是否属于招标文件?如果是的话,请返回'',如果不是的话,返回''。请不要返回其他解释或内容。 user_query = """该文件是否属于招标文件?如果是的话,请返回'',如果不是的话,返回''。请不要返回其他解释或内容。
以下是常见的招标文件类型 以下是常见的招标文件类型
@ -48,7 +60,7 @@ def judge_zbfile_exec(file_path):
model_res = qianwen_long(file_id, user_query) model_res = qianwen_long(file_id, user_query)
end_time = time.time() end_time = time.time()
print(f"judge_zbfile_exec实际耗时{end_time - start_time:.2f}") print(f"judge_zbfile_exec实际耗时{end_time - start_time:.2f}")
print(f"判断是否属于招标文件:{model_res}") print(f"判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f}")
return '' not in model_res return '' not in model_res
@ -58,7 +70,7 @@ def judge_zbfile_exec(file_path):
if __name__ == '__main__': if __name__ == '__main__':
start_time = time.time() start_time = time.time()
pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx" pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\19f53a17-ad4c-43b5-a7ed-981958ec3e0fs\ztbfile.docx"
res = judge_zbfile_exec(pdf_path) res = judge_zbfile_exec(pdf_path)
if res: if res:
print("yes") print("yes")

View File

@ -1,8 +1,24 @@
#flask_app/run_serve.py #flask_app/run_serve.py
import threading
import time
import requests
from waitress import serve from waitress import serve
from flask_app.start_up import create_app from flask_app.start_up import create_app
def warmup_request(delay=5, timeout=60):
    """Send a single warm-up POST to the local ``/judge_zbfile`` endpoint.

    Intended to run in a background thread started just before the server
    begins serving. Any failure is reported with ``print`` and otherwise
    ignored, so a failed warm-up never affects the server itself.

    Args:
        delay: Seconds to sleep before sending the request, so the server
            has time to start listening. Defaults to 5.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without it a stalled request would block this thread forever.
            The default is a guess; tune it to how long the endpoint
            really takes.

    Returns:
        None.
    """
    # Wait for the server to finish starting up before hitting it.
    time.sleep(delay)

    url = "http://127.0.0.1:5000/judge_zbfile"
    # The file URL must be a permanent address; per the original note, this
    # request exists to complete the warm start of the process pool.
    payload = {"file_url": "https://intellbid-open.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf"}
    headers = {"Content-Type": "application/json"}

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        print(f"Warm-up 请求发送成功,状态码:{response.status_code}")
    except Exception as e:
        # Best-effort: a failed warm-up must not take anything else down.
        print(f"Warm-up 请求出错:{e}")
if __name__ == "__main__": if __name__ == "__main__":
app = create_app() app = create_app()
threading.Thread(target=warmup_request, daemon=True).start()
serve(app, host='0.0.0.0', port=5000) serve(app, host='0.0.0.0', port=5000)

View File

@ -1,6 +1,6 @@
# flask_app/start_up.py # flask_app/start_up.py
import gc from concurrent.futures import ProcessPoolExecutor
import os from multiprocessing import Pool
from flask import Flask, g from flask import Flask, g
from flask_app.ConnectionLimiter import ConnectionLimiter from flask_app.ConnectionLimiter import ConnectionLimiter
@ -13,11 +13,10 @@ from flask_app.routes.test_zbparse import test_zbparse_bp
from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
from flask_app.routes.judge_zbfile import judge_zbfile_bp from flask_app.routes.judge_zbfile import judge_zbfile_bp
from flask_app.routes.test_preprocess import test_process_bp from flask_app.routes.test_preprocess import test_process_bp
def create_app(): def create_app():
# 创建全局日志记录器 # 创建全局日志记录器
app = Flask(__name__) app = Flask(__name__)
app.process_pool = Pool(processes=10, maxtasksperchild=3)
app.global_logger = create_logger_main('model_log') # 全局日志记录器 app.global_logger = create_logger_main('model_log') # 全局日志记录器
# 注册蓝图 # 注册蓝图
app.register_blueprint(get_deviation_bp) app.register_blueprint(get_deviation_bp)