12.19 invalid_path转md格式前增加判断

This commit is contained in:
zy123 2024-12-19 17:32:39 +08:00
parent 5ec5b78c6a
commit 9c3c6d889e
5 changed files with 107 additions and 24 deletions

View File

@ -55,7 +55,7 @@ def convert_file_to_markdown(file_path):
'markdown_details': 1,
'apply_document_tree': 1,
'dpi': 216, # 分辨率设置默认为144 dpi,
'get_image':None
'get_image':'none'
})
print("request time: ", resp.elapsed.total_seconds())
data = json.loads(resp.text)

View File

@ -1,7 +1,11 @@
# -*- encoding:utf-8 -*-
import concurrent.futures
import json
from flask_app.general.doubao import doubao_model
from flask_app.general.json_utils import clean_json_string
from flask_app.general.通义千问long import qianwen_long, qianwen_long_stream
def generate_continue_query(original_query, original_answer):
"""
生成继续回答的查询内容
@ -25,13 +29,15 @@ def generate_continue_query(original_query, original_answer):
return continue_query
def continue_answer(original_query, original_answer):
def continue_answer(original_query, original_answer, model_type=1, file_id=None):
"""
实现继续回答功能
实现继续回答功能支持选择不同模型
参数:
- original_query (str): 原始问题
- original_answer (str): 上一次模型的回答可能是不完整的 JSON 字符串
- model_type (int): 指定使用的模型类型1: doubao_model, 2: qianwen_long, 3: qianwen_long_stream
- file_id (str): 可选的文件 ID默认为 None
返回:
- json_data (dict): 拼接后的完整 JSON 数据如果解析失败返回 None
@ -40,10 +46,21 @@ def continue_answer(original_query, original_answer):
continue_query = generate_continue_query(original_query, original_answer)
print("继续问答")
print(continue_query)
# 调用模型获取继续的回答
model_res = doubao_model(continue_query)
# print(model_res)
# 根据模型类型选择调用的模型
if model_type == 1: # 使用 doubao_model
model_res = doubao_model(continue_query)
elif model_type == 2: # 使用 qianwen_long
if file_id is None:
raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
model_res = qianwen_long(file_id, continue_query)
elif model_type == 3: # 使用 qianwen_long_stream
if file_id is None:
raise ValueError("file_id 必须在使用 qianwen_long_stream 模型时提供!")
model_res = qianwen_long_stream(file_id, continue_query)
else:
raise ValueError(f"无效的模型类型: {model_type}")
# 拼接原始回答和模型的继续回答
# 删除 original_answer 的最右边的 `"` 和 `\n`
clean_original = original_answer.rstrip('"\n')
@ -68,6 +85,45 @@ def continue_answer(original_query, original_answer):
print(e)
return None
def process_continue_answers(questions_to_continue, model_type, file_id=None):
    """
    Run `continue_answer` concurrently for every answer that needs continuing
    and merge the returned dicts into a single result.

    Args:
    - questions_to_continue (list of tuples): items of the form
      (original_query, parsed_answer) that are forwarded to `continue_answer`.
    - model_type (int): model selector, forwarded unchanged to `continue_answer`.
    - file_id (str): optional file ID, defaults to None. It is only forwarded
      when model_type != 1; for model_type == 1 None is passed instead.

    Returns:
    - dict: union of all non-empty results returned by `continue_answer`.
      Results are merged in the order the questions were given, so when two
      results share a key the later question wins deterministically.
      An empty dict is returned when there is nothing to continue.
    """
    continued_results = {}
    if not questions_to_continue:
        return continued_results

    # model_type 1 is invoked without a file, so never hand it a file_id.
    effective_file_id = None if model_type == 1 else file_id
    worker_count = min(10, len(questions_to_continue))

    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Keep (query, future) pairs in submission order so that the merge below
        # does not depend on which worker thread happens to finish first.
        submitted = [
            (
                original_query,
                executor.submit(
                    continue_answer,
                    original_query,
                    parsed_answer,
                    model_type,
                    effective_file_id,
                ),
            )
            for original_query, parsed_answer in questions_to_continue
        ]

        for original_query, future in submitted:
            try:
                continued_result = future.result()
                if continued_result:  # skip None / empty results
                    continued_results.update(continued_result)
            except Exception as e:
                # Best effort: one failed question must not discard the others.
                print(f"在处理问题 '{original_query}' 时发生错误: {e}")

    return continued_results
# 示例使用
if __name__ == "__main__":
original_query = """请根据货物标中采购要求部分的内容,告诉我"▲乘客电梯"的技术参数或采购要求是什么。由于该货物存在 4 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为"▲乘客电梯-1";键值为一个列表,列表中包含若干描述"▲乘客电梯"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。

View File

@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
import textwrap
from flask_app.general.doubao import read_txt_to_string, pdf2txt
from flask_app.general.json_utils import combine_json_results, clean_json_string
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
from flask_app.general.通义千问long import upload_file, qianwen_long_stream
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf, pdf2docx
@ -291,33 +292,54 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
file_id = upload_file(procurement_docx_path) # 只上传一次文件,避免冗余调用
# 并行处理业务和技术查询
questions_to_continue = [] # 存储需要调用 continue_answer 的 (original_query, parsed)
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
futures = []
future_to_query = {} # 创建一个字典来映射 future 到 original_query
if busi_user_query:
if model_type:
# 如果是模型调用,直接使用 doubao_model
futures.append(executor.submit(doubao_model, busi_user_query))
future = executor.submit(doubao_model, busi_user_query)
else:
# 使用 qianwen_long_stream 并传入 file_id
futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)
futures.append(future)
future_to_query[future] = busi_user_query # 映射 future 到 busi_user_query
if tech_user_query:
if model_type:
# 如果是模型调用,直接使用 doubao_model
futures.append(executor.submit(doubao_model, tech_user_query))
future = executor.submit(doubao_model, tech_user_query)
else:
# 使用 qianwen_long_stream 并传入 file_id
futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)
futures.append(future)
future_to_query[future] = tech_user_query # 映射 future 到 tech_user_query
# 收集需要继续回答的问题
initial_results = {}
# 获取结果
for future in concurrent.futures.as_completed(futures):
original_query = future_to_query[future] # 获取对应的 original_query
try:
result = future.result()
if result: # 确保结果不为空
final_res.update(clean_json_string(result))
parsed = clean_json_string(result)
if isinstance(parsed, str): # flag为截断标记如果不能解析且len(response)>5000执行继续问答
questions_to_continue.append((original_query, parsed))
elif isinstance(parsed, dict):
initial_results.update(parsed)
else:
print(f"Parsed result is not a dict or str: {parsed}")
except Exception as e:
print(f"An error occurred: {e}")
# 处理需要继续回答的问题
if questions_to_continue:
continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
final_res.update(continued_results)
# 合并初步结果
final_res.update(initial_results)
return final_res
@ -325,11 +347,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
if __name__ == "__main__":
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
# truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf"
procurement_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf'
docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
# truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
# file_id = upload_file(truncate_file)
# processed_filepath = pdf2txt(procurement_path)
processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
final_res= get_business_requirements(procurement_path,processed_filepath)
final_res= get_business_requirements(procurement_path,processed_filepath,1)
print(json.dumps(final_res, ensure_ascii=False, indent=4))

View File

@ -1,11 +1,12 @@
# -*- encoding:utf-8 -*-
import concurrent.futures
import json
import os
import re
import time
from collections import defaultdict
from copy import deepcopy
from flask_app.general.model_continue_query import continue_answer
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.general.format_change import pdf2docx
from flask_app.general.多线程提问 import multi_threading
@ -487,22 +488,26 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
results = multi_threading(queries, "", "", 3) # 豆包
else:
results = multi_threading(queries, "", file_id, 2) # 豆包
# technical_requirements = []
technical_requirements_combined_res = {}
temp_final={}
if not results:
print("errror!未获得大模型的回答!")
else:
# 第一步:收集需要调用 `continue_answer` 的问题和解析结果
questions_to_continue = [] # 存储需要调用 continue_answer 的 (question, parsed)
for question, response in results:
parsed=clean_json_string(response)
if isinstance(parsed, str): #flag为截断标记如果不能解析且len(response)>5000执行继续问答
parsed=continue_answer(question,parsed)
technical_requirements_combined_res.update(parsed)
# technical_requirements.append(response)
# technical_requirements_combined_res = combine_json_results(technical_requirements)
questions_to_continue.append((question, parsed))
elif isinstance(parsed, dict):
temp_final.update(parsed)
# 第二步:多线程处理需要调用 `continue_answer` 的问题
if questions_to_continue:
continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
temp_final.update(continued_results)
"""根据所有键是否已添加处理技术要求"""
# 更新原始采购需求字典
final_res=combine_and_update_results(modified_data, technical_requirements_combined_res)
final_res=combine_and_update_results(modified_data, temp_final)
ffinal_res=all_postprocess(final_res)
ffinal_res["货物列表"] = good_list
# 输出最终的 JSON 字符串

View File

@ -76,9 +76,9 @@ if __name__ == "__main__":
start_time = time.time()
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
# file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf"
procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf"
procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"
res = fetch_procurement_reqs(procurement_path, invalid_path)
print(json.dumps(res, ensure_ascii=False, indent=4))
end_time = time.time()