12.19 invalid_path转md格式前增加判断

This commit is contained in:
zy123 2024-12-19 15:12:42 +08:00
parent 27830d271c
commit e19aaa04f6
2 changed files with 44 additions and 9 deletions

View File

@@ -7,7 +7,7 @@ from flask_app.general.doubao import read_txt_to_string, pdf2txt
from flask_app.general.json_utils import combine_json_results, clean_json_string
from flask_app.general.通义千问long import upload_file, qianwen_long_stream
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf
from flask_app.general.format_change import docx2pdf, pdf2docx
import concurrent.futures
from flask_app.general.doubao import doubao_model
@@ -255,26 +255,60 @@ def generate_template(required_keys,full_text, type=1):
user_query_template += f"\n\n文件内容:{full_text}"
return user_query_template
def get_business_requirements(procurement_path,processed_filepath):
required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求","\s*体\s*要\s*求","\s*设\s*要\s*求","\s*度\s*要\s*求","\s*期\s*要\s*求","\s*保\s*要\s*求","\s*训\s*要\s*求","\s*后\s*要\s*求"]
procurement_pdf_path=procurement_path
def get_business_requirements(procurement_path, processed_filepath, model_type):
required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求",
"\s*体\s*要\s*求", "\s*设\s*要\s*求", "\s*度\s*要\s*求", "\s*期\s*要\s*求",
"\s*保\s*要\s*求", "\s*训\s*要\s*求", "\s*后\s*要\s*求"]
# 将 doc/docx 转换为 pdf
procurement_pdf_path = procurement_path
if procurement_path.lower().endswith(('.doc', '.docx')):
procurement_pdf_path = docx2pdf(procurement_path)
# 查找包含的关键词
contained_keys = find_exists(procurement_pdf_path, required_keys)
print(contained_keys)
if not contained_keys:
return {}
# queries = generate_queries(truncate_file, contained_keys)
# 读取文件全文
full_text = read_txt_to_string(processed_filepath)
# 生成业务查询和技术查询
busi_user_query = generate_template(contained_keys, full_text, 1)
tech_user_query = generate_template(contained_keys, full_text, 2)
final_res={}
# 初始化结果存储
final_res = {}
# 如果是非模型调用,需要提前上传文件并获取 file_id
file_id = None
if not model_type:
procurement_docx_path=procurement_path
if procurement_path.lower().endswith('.pdf'):
procurement_docx_path = pdf2docx(procurement_path)
file_id = upload_file(procurement_docx_path) # 只上传一次文件,避免冗余调用
# 并行处理业务和技术查询
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
futures = []
if busi_user_query:
futures.append(executor.submit(doubao_model, busi_user_query))
if model_type:
# 如果是模型调用,直接使用 doubao_model
futures.append(executor.submit(doubao_model, busi_user_query))
else:
# 使用 qianwen_long_stream 并传入 file_id
futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
if tech_user_query:
futures.append(executor.submit(doubao_model, tech_user_query))
if model_type:
# 如果是模型调用,直接使用 doubao_model
futures.append(executor.submit(doubao_model, tech_user_query))
else:
# 使用 qianwen_long_stream 并传入 file_id
futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
# 获取结果
for future in concurrent.futures.as_completed(futures):
try:
@@ -283,6 +317,7 @@ def get_business_requirements(procurement_path,processed_filepath):
final_res.update(clean_json_string(result))
except Exception as e:
print(f"An error occurred: {e}")
return final_res

View File

@@ -47,7 +47,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
# 提交任务给线程池
future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
time.sleep(0.5) # 保持原有的延时
future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath,model_type)
# 获取并行任务的结果
technical_requirements = future_technical.result()
business_requirements = future_business.result()