12.19 Add a check before converting invalid_path to md format

This commit is contained in:
zy123 2024-12-19 17:32:39 +08:00
parent 5ec5b78c6a
commit 9c3c6d889e
5 changed files with 107 additions and 24 deletions

View File

@@ -55,7 +55,7 @@ def convert_file_to_markdown(file_path):
         'markdown_details': 1,
         'apply_document_tree': 1,
         'dpi': 216, # 分辨率设置默认为144 dpi,
-        'get_image':None
+        'get_image':'none'
     })
     print("request time: ", resp.elapsed.total_seconds())
     data = json.loads(resp.text)
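Note: convert_file_to_markdown is imported elsewhere in this commit from flask_app.general.file2markdown, so a minimal usage sketch may help orient the reader; the sample path below is invented, and the assumption that the function returns the converted markdown text is not confirmed by this hunk.

# Hypothetical usage sketch -- the path is invented and the return value is
# assumed (not confirmed by this hunk) to be the converted markdown text.
from flask_app.general.file2markdown import convert_file_to_markdown

md_text = convert_file_to_markdown(r"C:\samples\ztbfile.pdf")
print(md_text[:200])  # preview the beginning of the converted document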

View File

@@ -1,7 +1,11 @@
 # -*- encoding:utf-8 -*-
+import concurrent.futures
 import json
 from flask_app.general.doubao import doubao_model
 from flask_app.general.json_utils import clean_json_string
+from flask_app.general.通义千问long import qianwen_long, qianwen_long_stream
 def generate_continue_query(original_query, original_answer):
     """
     生成继续回答的查询内容
@@ -25,13 +29,15 @@ def generate_continue_query(original_query, original_answer):
     return continue_query
-def continue_answer(original_query, original_answer):
+def continue_answer(original_query, original_answer, model_type=1, file_id=None):
     """
-    实现继续回答功能
+    实现继续回答功能支持选择不同模型
     参数:
     - original_query (str): 原始问题
     - original_answer (str): 上一次模型的回答可能是不完整的 JSON 字符串
+    - model_type (int): 指定使用的模型类型1: doubao_model, 2: qianwen_long, 3: qianwen_long_stream
+    - file_id (str): 可选的文件 ID默认为 None
     返回:
     - json_data (dict): 拼接后的完整 JSON 数据如果解析失败返回 None
@@ -40,10 +46,21 @@ def continue_answer(original_query, original_answer):
     continue_query = generate_continue_query(original_query, original_answer)
     print("继续问答")
     print(continue_query)
-    # 调用模型获取继续的回答
-    model_res = doubao_model(continue_query)
-    # print(model_res)
+    # 根据模型类型选择调用的模型
+    if model_type == 1:  # 使用 doubao_model
+        model_res = doubao_model(continue_query)
+    elif model_type == 2:  # 使用 qianwen_long
+        if file_id is None:
+            raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
+        model_res = qianwen_long(file_id, continue_query)
+    elif model_type == 3:  # 使用 qianwen_long_stream
+        if file_id is None:
+            raise ValueError("file_id 必须在使用 qianwen_long_stream 模型时提供!")
+        model_res = qianwen_long_stream(file_id, continue_query)
+    else:
+        raise ValueError(f"无效的模型类型: {model_type}")
     # 拼接原始回答和模型的继续回答
     # 删除 original_answer 的最右边的 `"` 和 `\n`
     clean_original = original_answer.rstrip('"\n')
@@ -68,6 +85,45 @@ def continue_answer(original_query, original_answer):
         print(e)
         return None
+def process_continue_answers(questions_to_continue, model_type, file_id):
+    """
+    并行处理需要调用 `continue_answer` 的问题
+    参数:
+    - questions_to_continue (list of tuples): 需要继续回答的问题每个元素是 (original_query, parsed_answer)
+    - model_type (int): 指定使用的模型类型
+    - file_id (str): 可选的文件 ID默认为 None
+    返回:
+    - dict: 继续回答后的结果合并
+    """
+    continued_results = {}
+    if not questions_to_continue:
+        return continued_results
+    with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(questions_to_continue))) as executor:
+        future_to_question = {
+            executor.submit(
+                continue_answer,
+                original_query,
+                parsed_answer,
+                model_type,
+                None if model_type == 1 else file_id
+            ): original_query
+            for original_query, parsed_answer in questions_to_continue
+        }
+        for future in concurrent.futures.as_completed(future_to_question):
+            original_query = future_to_question[future]
+            try:
+                continued_result = future.result()
+                if continued_result:  # 确保结果不为空
+                    continued_results.update(continued_result)
+            except Exception as e:
+                print(f"在处理问题 '{original_query}' 时发生错误: {e}") #TODO:排查一下
 # 示例使用
 if __name__ == "__main__":
     original_query = """请根据货物标中采购要求部分的内容,告诉我"▲乘客电梯"的技术参数或采购要求是什么。由于该货物存在 4 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为"▲乘客电梯-1";键值为一个列表,列表中包含若干描述"▲乘客电梯"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。

View File

@@ -5,6 +5,7 @@ from PyPDF2 import PdfReader
 import textwrap
 from flask_app.general.doubao import read_txt_to_string, pdf2txt
 from flask_app.general.json_utils import combine_json_results, clean_json_string
+from flask_app.general.model_continue_query import continue_answer, process_continue_answers
 from flask_app.general.通义千问long import upload_file, qianwen_long_stream
 from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
 from flask_app.general.format_change import docx2pdf, pdf2docx
@@ -291,33 +292,54 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
         file_id = upload_file(procurement_docx_path) # 只上传一次文件,避免冗余调用
     # 并行处理业务和技术查询
+    questions_to_continue = [] # 存储需要调用 continue_answer 的 (original_query, parsed)
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         futures = []
+        future_to_query = {} # 创建一个字典来映射 future 到 original_query
         if busi_user_query:
             if model_type:
                 # 如果是模型调用,直接使用 doubao_model
-                futures.append(executor.submit(doubao_model, busi_user_query))
+                future = executor.submit(doubao_model, busi_user_query)
             else:
                 # 使用 qianwen_long_stream 并传入 file_id
-                futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
+                future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)
+            futures.append(future)
+            future_to_query[future] = busi_user_query # 映射 future 到 busi_user_query
         if tech_user_query:
             if model_type:
                 # 如果是模型调用,直接使用 doubao_model
-                futures.append(executor.submit(doubao_model, tech_user_query))
+                future = executor.submit(doubao_model, tech_user_query)
             else:
                 # 使用 qianwen_long_stream 并传入 file_id
-                futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
+                future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)
+            futures.append(future)
+            future_to_query[future] = tech_user_query # 映射 future 到 tech_user_query
+        # 收集需要继续回答的问题
+        initial_results = {}
         # 获取结果
         for future in concurrent.futures.as_completed(futures):
+            original_query = future_to_query[future] # 获取对应的 original_query
             try:
                 result = future.result()
                 if result: # 确保结果不为空
-                    final_res.update(clean_json_string(result))
+                    parsed = clean_json_string(result)
+                    if isinstance(parsed, str): # flag为截断标记如果不能解析且len(response)>5000执行继续问答
+                        questions_to_continue.append((original_query, parsed))
+                    elif isinstance(parsed, dict):
+                        initial_results.update(parsed)
+                    else:
+                        print(f"Parsed result is not a dict or str: {parsed}")
             except Exception as e:
                 print(f"An error occurred: {e}")
+    # 处理需要继续回答的问题
+    if questions_to_continue:
+        continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
+        final_res.update(continued_results)
+    # 合并初步结果
+    final_res.update(initial_results)
     return final_res
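The same collect-then-retry pattern is reused for the technical requirements in the next file. A condensed sketch of the flow, assuming (as the flag comment above indicates) that clean_json_string hands back the unparsed text as a plain str when a long answer is cut off; the response value below is invented.

# Condensed sketch of the retry path (the response value is invented).
result = '{"售后服务": ["免费质保'                    # hypothetical truncated model output
parsed = clean_json_string(result)                     # unparsable -> returned as a plain str
questions_to_continue.append((busi_user_query, parsed))
# once the executor drains, truncated answers are re-asked in parallel; both result sets merge into final_res:
final_res.update(process_continue_answers(questions_to_continue, model_type, file_id))
final_res.update(initial_results)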
@@ -325,11 +347,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
 if __name__ == "__main__":
     # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
     # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf"
-    procurement_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
+    procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf'
     docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
     # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
     # file_id = upload_file(truncate_file)
     # processed_filepath = pdf2txt(procurement_path)
     processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
-    final_res= get_business_requirements(procurement_path,processed_filepath)
+    final_res= get_business_requirements(procurement_path,processed_filepath,1)
     print(json.dumps(final_res, ensure_ascii=False, indent=4))

View File

@@ -1,11 +1,12 @@
 # -*- encoding:utf-8 -*-
+import concurrent.futures
 import json
 import os
 import re
 import time
 from collections import defaultdict
 from copy import deepcopy
-from flask_app.general.model_continue_query import continue_answer
+from flask_app.general.model_continue_query import continue_answer, process_continue_answers
 from flask_app.general.file2markdown import convert_file_to_markdown
 from flask_app.general.format_change import pdf2docx
 from flask_app.general.多线程提问 import multi_threading
@@ -487,22 +488,26 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
         results = multi_threading(queries, "", "", 3) # 豆包
     else:
         results = multi_threading(queries, "", file_id, 2) # 豆包
-    # technical_requirements = []
-    technical_requirements_combined_res = {}
+    temp_final={}
     if not results:
         print("errror!未获得大模型的回答!")
     else:
+        # 第一步:收集需要调用 `continue_answer` 的问题和解析结果
+        questions_to_continue = [] # 存储需要调用 continue_answer 的 (question, parsed)
         for question, response in results:
             parsed=clean_json_string(response)
             if isinstance(parsed, str): #flag为截断标记,如果不能解析且len(response)>5000,执行继续问答
-                parsed=continue_answer(question,parsed)
-            technical_requirements_combined_res.update(parsed)
-            # technical_requirements.append(response)
-        # technical_requirements_combined_res = combine_json_results(technical_requirements)
+                questions_to_continue.append((question, parsed))
+            elif isinstance(parsed, dict):
+                temp_final.update(parsed)
+        # 第二步:多线程处理需要调用 `continue_answer` 的问题
+        if questions_to_continue:
+            continued_results = process_continue_answers(questions_to_continue, model_type, file_id)
+            temp_final.update(continued_results)
     """根据所有键是否已添加处理技术要求"""
     # 更新原始采购需求字典
-    final_res=combine_and_update_results(modified_data, technical_requirements_combined_res)
+    final_res=combine_and_update_results(modified_data, temp_final)
     ffinal_res=all_postprocess(final_res)
     ffinal_res["货物列表"] = good_list
     # 输出最终的 JSON 字符串

View File

@@ -76,9 +76,9 @@ if __name__ == "__main__":
     start_time = time.time()
     output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
     # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
-    procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf"
+    procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf"
     procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
-    invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
+    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"
     res = fetch_procurement_reqs(procurement_path, invalid_path)
     print(json.dumps(res, ensure_ascii=False, indent=4))
     end_time = time.time()