Merge branch 'develop-test' into develop-2.16
This commit is contained in:
commit
16bab601ca
@ -25,7 +25,7 @@ def extract_text_by_page_fitz(file_path):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def extract_text_by_page(file_path):
|
def extract_text_by_page_pypdf2(file_path):
|
||||||
# common_header=""
|
# common_header=""
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
# print(common_header)
|
# print(common_header)
|
||||||
@ -109,7 +109,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 提取文本内容
|
# 提取文本内容
|
||||||
extracted_text = extract_text_by_page(pdf_path)
|
extracted_text = extract_text_by_page_pypdf2(pdf_path)
|
||||||
|
|
||||||
# 将提取的文本写入TXT文件
|
# 将提取的文本写入TXT文件
|
||||||
with open(txt_path, 'w', encoding='utf-8') as txt_file:
|
with open(txt_path, 'w', encoding='utf-8') as txt_file:
|
||||||
@ -203,7 +203,7 @@ if __name__ == '__main__':
|
|||||||
ress = extract_common_header(pdf_path)
|
ress = extract_common_header(pdf_path)
|
||||||
print(ress)
|
print(ress)
|
||||||
print("-----------------")
|
print("-----------------")
|
||||||
extract_text_by_page(pdf_path)
|
extract_text_by_page_pypdf2(pdf_path)
|
||||||
# res=extract_text_by_page_fitz(pdf_path)
|
# res=extract_text_by_page_fitz(pdf_path)
|
||||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
# print(res)磋商文件_tobidders_notice_part2.pdf
|
||||||
# save_extracted_text_to_txt(file_path,"output.txt")
|
# save_extracted_text_to_txt(file_path,"output.txt")
|
||||||
|
@ -66,9 +66,9 @@ def create_logger(app, subfolder):
|
|||||||
file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||||
file_handler.setFormatter(file_formatter)
|
file_handler.setFormatter(file_formatter)
|
||||||
logger.addHandler(file_handler)
|
logger.addHandler(file_handler)
|
||||||
# stream_handler = logging.StreamHandler()
|
stream_handler = logging.StreamHandler()
|
||||||
# stream_handler.setFormatter(logging.Formatter('%(message)s'))
|
stream_handler.setFormatter(logging.Formatter('%(message)s'))
|
||||||
# logger.addHandler(stream_handler)
|
logger.addHandler(stream_handler)
|
||||||
logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。
|
logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。
|
||||||
logger.propagate = False
|
logger.propagate = False
|
||||||
g.logger = logger
|
g.logger = logger
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page
|
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page_pypdf2
|
||||||
|
|
||||||
def check_strings_in_pdf(file_path):
|
def check_strings_in_pdf(file_path):
|
||||||
judge_list=['施工机械设备', '企业信息登记']
|
judge_list=['施工机械设备', '企业信息登记']
|
||||||
# Read text from PDF
|
# Read text from PDF
|
||||||
text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF
|
text = extract_text_by_page_pypdf2(file_path) # Assuming this returns all text from the PDF
|
||||||
full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text
|
full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text
|
||||||
|
|
||||||
# Initialize the questions list
|
# Initialize the questions list
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from flask import Blueprint, g
|
from flask import Blueprint, g, current_app
|
||||||
from flask_app.general.format_change import download_file
|
from flask_app.general.format_change import download_file
|
||||||
from flask_app.routes.判断是否是招标文件 import judge_zbfile_exec
|
from flask_app.routes.判断是否是招标文件 import judge_zbfile_exec
|
||||||
from flask_app.routes.utils import validate_and_setup_logger, create_response_normal, log_error_unique_id
|
from flask_app.routes.utils import validate_and_setup_logger, create_response_normal, log_error_unique_id
|
||||||
@ -26,7 +27,18 @@ def judge_zbfile() -> Any: #判断是否是招标文件
|
|||||||
unique_id = g.unique_id
|
unique_id = g.unique_id
|
||||||
result = [None] # 用于存储结果的可变对象
|
result = [None] # 用于存储结果的可变对象
|
||||||
done = threading.Event() # 标志判断是否完成
|
done = threading.Event() # 标志判断是否完成
|
||||||
|
pool = current_app.process_pool # 使用全局的进程池
|
||||||
|
def judge_zbfile_exec_sub(file_path):
    """Run ``judge_zbfile_exec`` for *file_path* on the shared process pool.

    ``pool`` is taken from the enclosing scope. ``pool.apply`` is called
    synchronously here, and whatever it returns is passed straight back
    to the caller.
    """
    call_args = (file_path,)
    return pool.apply(judge_zbfile_exec, args=call_args)
|
||||||
def wrapper() -> None:
|
def wrapper() -> None:
|
||||||
"""
|
"""
|
||||||
包装整个 judge_zbfile 的函数逻辑
|
包装整个 judge_zbfile 的函数逻辑
|
||||||
@ -38,7 +50,6 @@ def judge_zbfile() -> Any: #判断是否是招标文件
|
|||||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
|
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
|
||||||
|
|
||||||
if not downloaded_filepath or file_type == 4:
|
if not downloaded_filepath or file_type == 4:
|
||||||
logger.error("下载地址不存在或不支持的文件类型!")
|
|
||||||
log_error_unique_id(unique_id, 4)
|
log_error_unique_id(unique_id, 4)
|
||||||
result[0] = JudgeResult.ERROR
|
result[0] = JudgeResult.ERROR
|
||||||
return
|
return
|
||||||
@ -46,7 +57,7 @@ def judge_zbfile() -> Any: #判断是否是招标文件
|
|||||||
logger.info(f"Local file path: {downloaded_filepath}")
|
logger.info(f"Local file path: {downloaded_filepath}")
|
||||||
|
|
||||||
# 调用实际的判断函数
|
# 调用实际的判断函数
|
||||||
judge_result = judge_zbfile_exec(downloaded_filepath)
|
judge_result = judge_zbfile_exec_sub(downloaded_filepath)
|
||||||
judge = JudgeResult.YES if judge_result else JudgeResult.NO
|
judge = JudgeResult.YES if judge_result else JudgeResult.NO
|
||||||
|
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import time
|
import time
|
||||||
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
import fitz
|
||||||
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
|
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
|
||||||
|
|
||||||
def judge_zbfile_exec(file_path):
|
def judge_zbfile_exec(file_path):
|
||||||
@ -12,28 +12,40 @@ def judge_zbfile_exec(file_path):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
# 检查文件是否为PDF格式
|
# 检查文件是否为PDF格式
|
||||||
if file_path.lower().endswith('.pdf'):
|
if file_path.lower().endswith('.pdf'):
|
||||||
# 使用 with 语句确保文件关闭
|
try:
|
||||||
with open(file_path, 'rb') as f:
|
with open(file_path, 'rb') as f:
|
||||||
reader = PdfReader(f)
|
reader = PdfReader(f)
|
||||||
num_pages = len(reader.pages)
|
num_pages = len(reader.pages)
|
||||||
if num_pages <= 5:
|
except Exception:
|
||||||
return False
|
try:
|
||||||
elif file_path.lower().endswith('.docx'):
|
doc = fitz.open(file_path)
|
||||||
doc = Document(file_path)
|
num_pages = len(doc)
|
||||||
accumulated_text = ""
|
except Exception:
|
||||||
chunk_size = 10 # 每次读取10个段落
|
print("PDF 文件读取失败")
|
||||||
paragraphs = doc.paragraphs
|
return False # 两种解析方式都失败,直接返回 False
|
||||||
|
|
||||||
for i in range(0, len(paragraphs), chunk_size):
|
if num_pages <= 5:
|
||||||
chunk = paragraphs[i:i + chunk_size]
|
return False # 小于等于 5 页的 PDF 直接判定为非招标文件
|
||||||
for para in chunk:
|
elif file_path.lower().endswith('.docx'):
|
||||||
accumulated_text += para.text
|
try:
|
||||||
# 判断累计字符数是否已经达到1000字,
|
doc = Document(file_path)
|
||||||
if len(accumulated_text) >= 1000:
|
accumulated_text = ""
|
||||||
break
|
chunk_size = 10 # 每次读取10个段落
|
||||||
# 若累计内容不足1000字,则直接返回False
|
paragraphs = doc.paragraphs
|
||||||
if len(accumulated_text) < 1000:
|
|
||||||
return False
|
for i in range(0, len(paragraphs), chunk_size):
|
||||||
|
chunk = paragraphs[i:i + chunk_size]
|
||||||
|
for para in chunk:
|
||||||
|
accumulated_text += para.text
|
||||||
|
if len(accumulated_text) >= 1000:
|
||||||
|
break # 读取超过1000字后即可停止
|
||||||
|
if len(accumulated_text) < 1000:
|
||||||
|
return False # 若累计内容不足1000字,则直接返回 False
|
||||||
|
except Exception:
|
||||||
|
print("DOCX 文件读取失败,可能为乱码文件")
|
||||||
|
return False # 解析失败直接返回 False
|
||||||
|
pre_endtime=time.time()
|
||||||
|
print(f"judge_zbfile_exec预处理耗时:{pre_endtime - start_time:.2f} 秒")
|
||||||
# 使用大模型进行判断
|
# 使用大模型进行判断
|
||||||
user_query = """该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。
|
user_query = """该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。
|
||||||
以下是常见的招标文件类型:
|
以下是常见的招标文件类型:
|
||||||
@ -48,7 +60,7 @@ def judge_zbfile_exec(file_path):
|
|||||||
model_res = qianwen_long(file_id, user_query)
|
model_res = qianwen_long(file_id, user_query)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
||||||
print(f"判断是否属于招标文件:{model_res}")
|
print(f"判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒")
|
||||||
|
|
||||||
return '否' not in model_res
|
return '否' not in model_res
|
||||||
|
|
||||||
@ -58,7 +70,7 @@ def judge_zbfile_exec(file_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx"
|
pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\19f53a17-ad4c-43b5-a7ed-981958ec3e0fs\ztbfile.docx"
|
||||||
res = judge_zbfile_exec(pdf_path)
|
res = judge_zbfile_exec(pdf_path)
|
||||||
if res:
|
if res:
|
||||||
print("yes")
|
print("yes")
|
||||||
|
@ -1,8 +1,24 @@
|
|||||||
#flask_app/run_serve.py
|
#flask_app/run_serve.py
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import requests
|
||||||
from waitress import serve
|
from waitress import serve
|
||||||
from flask_app.start_up import create_app
|
from flask_app.start_up import create_app
|
||||||
|
def warmup_request(delay=5, timeout=120):
    """Send a single warm-up POST to the local ``/judge_zbfile`` endpoint.

    The function sleeps first so the server has time to start listening,
    then posts a fixed sample ``file_url``. It is best-effort: both success
    and failure are reported with ``print`` and no exception propagates.

    Args:
        delay: Seconds to sleep before sending the request.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled server cannot block the calling thread forever.

    Returns:
        None.
    """
    # Give the server time to finish starting before hitting it.
    time.sleep(delay)
    try:
        url = "http://127.0.0.1:5000/judge_zbfile"
        # The file_url must be a permanent address so the warm-up request
        # keeps working; adjust it to the actual deployment if needed.
        payload = {"file_url": "https://intellbid-open.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf"}
        headers = {"Content-Type": "application/json"}
        # Without an explicit timeout, requests would wait indefinitely.
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        print(f"Warm-up 请求发送成功,状态码:{response.status_code}")
    except Exception as e:
        # Deliberately broad: a failed warm-up must never take the server down.
        print(f"Warm-up 请求出错:{e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app = create_app()
|
app = create_app()
|
||||||
serve(app, host='0.0.0.0', port=5000)
|
threading.Thread(target=warmup_request, daemon=True).start()
|
||||||
|
serve(app, host='0.0.0.0', port=5000)
|
@ -1,6 +1,6 @@
|
|||||||
# flask_app/start_up.py
|
# flask_app/start_up.py
|
||||||
import gc
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
import os
|
from multiprocessing import Pool
|
||||||
|
|
||||||
from flask import Flask, g
|
from flask import Flask, g
|
||||||
from flask_app.ConnectionLimiter import ConnectionLimiter
|
from flask_app.ConnectionLimiter import ConnectionLimiter
|
||||||
@ -13,11 +13,10 @@ from flask_app.routes.test_zbparse import test_zbparse_bp
|
|||||||
from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
|
from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
|
||||||
from flask_app.routes.judge_zbfile import judge_zbfile_bp
|
from flask_app.routes.judge_zbfile import judge_zbfile_bp
|
||||||
from flask_app.routes.test_preprocess import test_process_bp
|
from flask_app.routes.test_preprocess import test_process_bp
|
||||||
|
|
||||||
|
|
||||||
def create_app():
|
def create_app():
|
||||||
# 创建全局日志记录器
|
# 创建全局日志记录器
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
app.process_pool = Pool(processes=10, maxtasksperchild=3)
|
||||||
app.global_logger = create_logger_main('model_log') # 全局日志记录器
|
app.global_logger = create_logger_main('model_log') # 全局日志记录器
|
||||||
# 注册蓝图
|
# 注册蓝图
|
||||||
app.register_blueprint(get_deviation_bp)
|
app.register_blueprint(get_deviation_bp)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user