12.19 Add a check before converting invalid_path to md format
parent 5ec5b78c6a
commit 9c3c6d889e
@@ -55,7 +55,7 @@ def convert_file_to_markdown(file_path):
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 216, # 分辨率设置默认为144 dpi,
        'get_image':None
        'get_image':'none'
    })
    print("request time: ", resp.elapsed.total_seconds())
    data = json.loads(resp.text)
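The only functional change in this hunk is the image option: the service evidently expects the literal string 'none' rather than Python None. A minimal sketch of the call site this option belongs to is below; the endpoint URL, headers, and the exact requests usage are illustrative assumptions, not the repository's actual convert_file_to_markdown implementation.

import json
import requests

def convert_to_markdown_sketch(file_path, url, headers):
    # Hypothetical sketch of the surrounding request; names and endpoint are assumptions.
    params = {
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 216,            # resolution; the original comment notes the default is 144 dpi
        'get_image': 'none',   # the string 'none', not Python None
    }
    with open(file_path, 'rb') as f:
        resp = requests.post(url, params=params, headers=headers, data=f)
    print("request time: ", resp.elapsed.total_seconds())
    return json.loads(resp.text)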
@@ -1,7 +1,11 @@
# -*- encoding:utf-8 -*-
import concurrent.futures
import json
from flask_app.general.doubao import doubao_model
from flask_app.general.json_utils import clean_json_string
from flask_app.general.通义千问long import qianwen_long, qianwen_long_stream


def generate_continue_query(original_query, original_answer):
    """
    生成继续回答的查询内容。
@@ -25,13 +29,15 @@ def generate_continue_query(original_query, original_answer):
    return continue_query


def continue_answer(original_query, original_answer):
def continue_answer(original_query, original_answer, model_type=1, file_id=None):
    """
    实现“继续回答”功能。
    实现“继续回答”功能,支持选择不同模型。

    参数:
    - original_query (str): 原始问题。
    - original_answer (str): 上一次模型的回答(可能是不完整的 JSON 字符串)。
    - model_type (int): 指定使用的模型类型(1: doubao_model, 2: qianwen_long, 3: qianwen_long_stream)。
    - file_id (str): 可选的文件 ID,默认为 None。

    返回:
    - json_data (dict): 拼接后的完整 JSON 数据。如果解析失败,返回 None。
@@ -40,10 +46,21 @@ def continue_answer(original_query, original_answer):
    continue_query = generate_continue_query(original_query, original_answer)
    print("继续问答")
    print(continue_query)
    # 调用模型获取继续的回答
    model_res = doubao_model(continue_query)

    # print(model_res)
    # 根据模型类型选择调用的模型
    if model_type == 1: # 使用 doubao_model
        model_res = doubao_model(continue_query)
    elif model_type == 2: # 使用 qianwen_long
        if file_id is None:
            raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
        model_res = qianwen_long(file_id, continue_query)
    elif model_type == 3: # 使用 qianwen_long_stream
        if file_id is None:
            raise ValueError("file_id 必须在使用 qianwen_long_stream 模型时提供!")
        model_res = qianwen_long_stream(file_id, continue_query)
    else:
        raise ValueError(f"无效的模型类型: {model_type}")

    # 拼接原始回答和模型的继续回答
    # 删除 original_answer 的最右边的 `"` 和 `\n`
    clean_original = original_answer.rstrip('"\n')
@@ -68,6 +85,45 @@ def continue_answer(original_query, original_answer):
        print(e)
        return None

def process_continue_answers(questions_to_continue, model_type, file_id):
    """
    并行处理需要调用 `continue_answer` 的问题。

    参数:
    - questions_to_continue (list of tuples): 需要继续回答的问题,每个元素是 (original_query, parsed_answer)。
    - model_type (int): 指定使用的模型类型。
    - file_id (str): 可选的文件 ID,默认为 None。

    返回:
    - dict: 继续回答后的结果合并。
    """
    continued_results = {}
    if not questions_to_continue:
        return continued_results

    with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(questions_to_continue))) as executor:
        future_to_question = {
            executor.submit(
                continue_answer,
                original_query,
                parsed_answer,
                model_type,
                None if model_type == 1 else file_id
            ): original_query
            for original_query, parsed_answer in questions_to_continue
        }

        for future in concurrent.futures.as_completed(future_to_question):
            original_query = future_to_question[future]
            try:
                continued_result = future.result()
                if continued_result: # 确保结果不为空
                    continued_results.update(continued_result)
            except Exception as e:
                print(f"在处理问题 '{original_query}' 时发生错误: {e}") #TODO:排查一下

    return continued_results

# 示例使用
if __name__ == "__main__":
    original_query = """请根据货物标中采购要求部分的内容,告诉我"▲乘客电梯"的技术参数或采购要求是什么。由于该货物存在 4 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为"▲乘客电梯-1";键值为一个列表,列表中包含若干描述"▲乘客电梯"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
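With the new signature, callers pick the model once: model_type=1 keeps the original doubao_model path and needs no file, while model_type=2/3 route through qianwen_long / qianwen_long_stream and require a file_id. A usage sketch under those assumptions follows; the truncated answers and the file_id are made-up placeholders.

from flask_app.general.model_continue_query import continue_answer, process_continue_answers

# Continue one truncated answer with doubao_model; no file_id is needed for model_type=1.
partial = '{"item-1": ["spec 1", "spec 2'          # hypothetical truncated JSON string
merged = continue_answer("What are the specs of the item?", partial, model_type=1)
if merged is None:
    print("the continued answer still could not be parsed into JSON")

# Continue several truncated answers in parallel with qianwen_long_stream (model_type=3).
questions_to_continue = [
    ("query A", '{"goods-A": ["param 1'),
    ("query B", '{"goods-B": ["param 1", "param 2'),
]
continued = process_continue_answers(questions_to_continue, model_type=3, file_id="fileid-placeholder")
print(continued)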
@@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
import textwrap
from flask_app.general.doubao import read_txt_to_string, pdf2txt
from flask_app.general.json_utils import combine_json_results, clean_json_string
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
from flask_app.general.通义千问long import upload_file, qianwen_long_stream
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf, pdf2docx
@@ -291,33 +292,54 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
    file_id = upload_file(procurement_docx_path) # 只上传一次文件,避免冗余调用

    # 并行处理业务和技术查询
    questions_to_continue = [] # 存储需要调用 continue_answer 的 (original_query, parsed)
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        future_to_query = {} # 创建一个字典来映射 future 到 original_query

        if busi_user_query:
            if model_type:
                # 如果是模型调用,直接使用 doubao_model
                futures.append(executor.submit(doubao_model, busi_user_query))
                future = executor.submit(doubao_model, busi_user_query)
            else:
                # 使用 qianwen_long_stream 并传入 file_id
                futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
                future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)
            futures.append(future)
            future_to_query[future] = busi_user_query # 映射 future 到 busi_user_query

        if tech_user_query:
            if model_type:
                # 如果是模型调用,直接使用 doubao_model
                futures.append(executor.submit(doubao_model, tech_user_query))
                future = executor.submit(doubao_model, tech_user_query)
            else:
                # 使用 qianwen_long_stream 并传入 file_id
                futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))

                future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)
            futures.append(future)
            future_to_query[future] = tech_user_query # 映射 future 到 tech_user_query
        # 收集需要继续回答的问题
        initial_results = {}
        # 获取结果
        for future in concurrent.futures.as_completed(futures):
            original_query = future_to_query[future] # 获取对应的 original_query
            try:
                result = future.result()
                if result: # 确保结果不为空
                    final_res.update(clean_json_string(result))
                    parsed = clean_json_string(result)
                    if isinstance(parsed, str): # flag为截断标记,如果不能解析且len(response)>5000,执行继续问答!
                        questions_to_continue.append((original_query, parsed))
                    elif isinstance(parsed, dict):
                        initial_results.update(parsed)
                    else:
                        print(f"Parsed result is not a dict or str: {parsed}")
            except Exception as e:
                print(f"An error occurred: {e}")
    # 处理需要继续回答的问题
    if questions_to_continue:
        continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
        final_res.update(continued_results)

    # 合并初步结果
    final_res.update(initial_results)
    return final_res


@@ -325,11 +347,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
if __name__ == "__main__":
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
    # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf"
    procurement_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
    procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf'
    docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
    # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
    # file_id = upload_file(truncate_file)
    # processed_filepath = pdf2txt(procurement_path)
    processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
    final_res= get_business_requirements(procurement_path,processed_filepath)
    final_res= get_business_requirements(procurement_path,processed_filepath,1)
    print(json.dumps(final_res, ensure_ascii=False, indent=4))
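The rewrite above boils down to one pattern: remember which query each future belongs to, merge dict results immediately, and queue string (truncated) results for a second round via process_continue_answers. A condensed, hypothetical sketch of that pattern, with submit_query standing in for the doubao_model / qianwen_long_stream calls:

import concurrent.futures
from flask_app.general.json_utils import clean_json_string
from flask_app.general.model_continue_query import process_continue_answers

def run_queries_sketch(queries, submit_query, model_type, file_id):
    # Condensed stand-in for the flow inside get_business_requirements above.
    final_res, questions_to_continue = {}, []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future_to_query = {executor.submit(submit_query, q): q for q in queries}
        for future in concurrent.futures.as_completed(future_to_query):
            query = future_to_query[future]
            try:
                parsed = clean_json_string(future.result())
                if isinstance(parsed, dict):
                    final_res.update(parsed)                       # complete answer: merge now
                elif isinstance(parsed, str):
                    questions_to_continue.append((query, parsed))  # truncated: re-ask later
            except Exception as e:
                print(f"An error occurred: {e}")
    if questions_to_continue:
        final_res.update(process_continue_answers(questions_to_continue, model_type, file_id))
    return final_res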
@@ -1,11 +1,12 @@
# -*- encoding:utf-8 -*-
import concurrent.futures
import json
import os
import re
import time
from collections import defaultdict
from copy import deepcopy
from flask_app.general.model_continue_query import continue_answer
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.general.format_change import pdf2docx
from flask_app.general.多线程提问 import multi_threading
@@ -487,22 +488,26 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
        results = multi_threading(queries, "", "", 3) # 豆包
    else:
        results = multi_threading(queries, "", file_id, 2) # 豆包
    # technical_requirements = []
    technical_requirements_combined_res = {}
    temp_final={}
    if not results:
        print("errror!未获得大模型的回答!")
    else:
        # 第一步:收集需要调用 `continue_answer` 的问题和解析结果
        questions_to_continue = [] # 存储需要调用 continue_answer 的 (question, parsed)
        for question, response in results:
            parsed=clean_json_string(response)
            if isinstance(parsed, str): #flag为截断标记,如果不能解析且len(response)>5000,执行继续问答!
                parsed=continue_answer(question,parsed)
                technical_requirements_combined_res.update(parsed)
                # technical_requirements.append(response)
                # technical_requirements_combined_res = combine_json_results(technical_requirements)
                questions_to_continue.append((question, parsed))
            elif isinstance(parsed, dict):
                temp_final.update(parsed)
        # 第二步:多线程处理需要调用 `continue_answer` 的问题
        if questions_to_continue:
            continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
            temp_final.update(continued_results)

    """根据所有键是否已添加处理技术要求"""
    # 更新原始采购需求字典
    final_res=combine_and_update_results(modified_data, technical_requirements_combined_res)
    final_res=combine_and_update_results(modified_data, temp_final)
    ffinal_res=all_postprocess(final_res)
    ffinal_res["货物列表"] = good_list
    # 输出最终的 JSON 字符串
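Both call sites rely on the same contract: clean_json_string returns a dict when the model output parses, and hands the raw string back when the output looks truncated, which is what the isinstance checks key on. A toy stand-in illustrating that assumed behaviour (the real helper lives in flask_app.general.json_utils and may differ):

import json
import re

def clean_json_string_sketch(response: str):
    # Toy stand-in: dict on success, the original string when the answer seems cut off.
    text = re.sub(r'^```(?:json)?|```$', '', response.strip()).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Per the comments above, an unparseable response longer than 5000 chars is
        # treated as a truncated answer and handed to continue_answer later.
        return response if len(response) > 5000 else {}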
@@ -76,9 +76,9 @@ if __name__ == "__main__":
    start_time = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf"
    procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf"
    procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
    invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"
    res = fetch_procurement_reqs(procurement_path, invalid_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()