From 9c3c6d889ef97da2341994ca1b177319c62a1dc1 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 19 Dec 2024 17:32:39 +0800 Subject: [PATCH] =?UTF-8?q?12.19=20invalid=5Fpath=E8=BD=ACmd=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F=E5=89=8D=E5=A2=9E=E5=8A=A0=E5=88=A4=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/file2markdown.py | 2 +- flask_app/general/model_continue_query.py | 66 +++++++++++++++++++++-- flask_app/货物标/商务服务其他要求提取.py | 38 ++++++++++--- flask_app/货物标/技术参数要求提取.py | 21 +++++--- flask_app/货物标/提取采购需求main.py | 4 +- 5 files changed, 107 insertions(+), 24 deletions(-) diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py index ae37bff..8501cf6 100644 --- a/flask_app/general/file2markdown.py +++ b/flask_app/general/file2markdown.py @@ -55,7 +55,7 @@ def convert_file_to_markdown(file_path): 'markdown_details': 1, 'apply_document_tree': 1, 'dpi': 216, # 分辨率设置默认为144 dpi, - 'get_image':None + 'get_image':'none' }) print("request time: ", resp.elapsed.total_seconds()) data = json.loads(resp.text) diff --git a/flask_app/general/model_continue_query.py b/flask_app/general/model_continue_query.py index 84c4dfa..24ee8e0 100644 --- a/flask_app/general/model_continue_query.py +++ b/flask_app/general/model_continue_query.py @@ -1,7 +1,11 @@ # -*- encoding:utf-8 -*- +import concurrent.futures import json from flask_app.general.doubao import doubao_model from flask_app.general.json_utils import clean_json_string +from flask_app.general.通义千问long import qianwen_long, qianwen_long_stream + + def generate_continue_query(original_query, original_answer): """ 生成继续回答的查询内容。 @@ -25,13 +29,15 @@ def generate_continue_query(original_query, original_answer): return continue_query -def continue_answer(original_query, original_answer): +def continue_answer(original_query, original_answer, model_type=1, file_id=None): """ - 实现“继续回答”功能。 + 实现“继续回答”功能,支持选择不同模型。 参数: - original_query (str): 原始问题。 - original_answer (str): 上一次模型的回答(可能是不完整的 JSON 字符串)。 + - model_type (int): 指定使用的模型类型(1: doubao_model, 2: qianwen_long, 3: qianwen_long_stream)。 + - file_id (str): 可选的文件 ID,默认为 None。 返回: - json_data (dict): 拼接后的完整 JSON 数据。如果解析失败,返回 None。 @@ -40,10 +46,21 @@ def continue_answer(original_query, original_answer): continue_query = generate_continue_query(original_query, original_answer) print("继续问答") print(continue_query) - # 调用模型获取继续的回答 - model_res = doubao_model(continue_query) - # print(model_res) + # 根据模型类型选择调用的模型 + if model_type == 1: # 使用 doubao_model + model_res = doubao_model(continue_query) + elif model_type == 2: # 使用 qianwen_long + if file_id is None: + raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!") + model_res = qianwen_long(file_id, continue_query) + elif model_type == 3: # 使用 qianwen_long_stream + if file_id is None: + raise ValueError("file_id 必须在使用 qianwen_long_stream 模型时提供!") + model_res = qianwen_long_stream(file_id, continue_query) + else: + raise ValueError(f"无效的模型类型: {model_type}") + # 拼接原始回答和模型的继续回答 # 删除 original_answer 的最右边的 `"` 和 `\n` clean_original = original_answer.rstrip('"\n') @@ -68,6 +85,45 @@ def continue_answer(original_query, original_answer): print(e) return None +def process_continue_answers(questions_to_continue, model_type, file_id): + """ + 并行处理需要调用 `continue_answer` 的问题。 + + 参数: + - questions_to_continue (list of tuples): 需要继续回答的问题,每个元素是 (original_query, parsed_answer)。 + - model_type (int): 指定使用的模型类型。 + - file_id (str): 可选的文件 ID,默认为 None。 + + 返回: + - dict: 继续回答后的结果合并。 + """ + continued_results = {} + if not questions_to_continue: + return continued_results + + with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(questions_to_continue))) as executor: + future_to_question = { + executor.submit( + continue_answer, + original_query, + parsed_answer, + model_type, + None if model_type == 1 else file_id + ): original_query + for original_query, parsed_answer in questions_to_continue + } + + for future in concurrent.futures.as_completed(future_to_question): + original_query = future_to_question[future] + try: + continued_result = future.result() + if continued_result: # 确保结果不为空 + continued_results.update(continued_result) + except Exception as e: + print(f"在处理问题 '{original_query}' 时发生错误: {e}") #TODO:排查一下 + + return continued_results + # 示例使用 if __name__ == "__main__": original_query = """请根据货物标中采购要求部分的内容,告诉我"▲乘客电梯"的技术参数或采购要求是什么。由于该货物存在 4 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为"▲乘客电梯-1";键值为一个列表,列表中包含若干描述"▲乘客电梯"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。 diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index f7bac2b..2fc5e0c 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -5,6 +5,7 @@ from PyPDF2 import PdfReader import textwrap from flask_app.general.doubao import read_txt_to_string, pdf2txt from flask_app.general.json_utils import combine_json_results, clean_json_string +from flask_app.general.model_continue_query import continue_answer, process_continue_answers from flask_app.general.通义千问long import upload_file, qianwen_long_stream from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf, pdf2docx @@ -291,33 +292,54 @@ def get_business_requirements(procurement_path, processed_filepath, model_type): file_id = upload_file(procurement_docx_path) # 只上传一次文件,避免冗余调用 # 并行处理业务和技术查询 + questions_to_continue = [] # 存储需要调用 continue_answer 的 (original_query, parsed) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: futures = [] + future_to_query = {} # 创建一个字典来映射 future 到 original_query + if busi_user_query: if model_type: # 如果是模型调用,直接使用 doubao_model - futures.append(executor.submit(doubao_model, busi_user_query)) + future = executor.submit(doubao_model, busi_user_query) else: # 使用 qianwen_long_stream 并传入 file_id - futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)) + future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1) + futures.append(future) + future_to_query[future] = busi_user_query # 映射 future 到 busi_user_query if tech_user_query: if model_type: # 如果是模型调用,直接使用 doubao_model - futures.append(executor.submit(doubao_model, tech_user_query)) + future = executor.submit(doubao_model, tech_user_query) else: # 使用 qianwen_long_stream 并传入 file_id - futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)) - + future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1) + futures.append(future) + future_to_query[future] = tech_user_query # 映射 future 到 tech_user_query + # 收集需要继续回答的问题 + initial_results = {} # 获取结果 for future in concurrent.futures.as_completed(futures): + original_query = future_to_query[future] # 获取对应的 original_query try: result = future.result() if result: # 确保结果不为空 - final_res.update(clean_json_string(result)) + parsed = clean_json_string(result) + if isinstance(parsed, str): # flag为截断标记,如果不能解析且len(response)>5000,执行继续问答! + questions_to_continue.append((original_query, parsed)) + elif isinstance(parsed, dict): + initial_results.update(parsed) + else: + print(f"Parsed result is not a dict or str: {parsed}") except Exception as e: print(f"An error occurred: {e}") + # 处理需要继续回答的问题 + if questions_to_continue: + continued_results = process_continue_answers(questions_to_continue, model_type, file_id) + final_res.update(continued_results) + # 合并初步结果 + final_res.update(initial_results) return final_res @@ -325,11 +347,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type): if __name__ == "__main__": # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx" # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf" - procurement_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx' + procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf' docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx' # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf" # file_id = upload_file(truncate_file) # processed_filepath = pdf2txt(procurement_path) processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt' - final_res= get_business_requirements(procurement_path,processed_filepath) + final_res= get_business_requirements(procurement_path,processed_filepath,1) print(json.dumps(final_res, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index ef0d0e0..d54a4f6 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -1,11 +1,12 @@ # -*- encoding:utf-8 -*- +import concurrent.futures import json import os import re import time from collections import defaultdict from copy import deepcopy -from flask_app.general.model_continue_query import continue_answer +from flask_app.general.model_continue_query import continue_answer, process_continue_answers from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx from flask_app.general.多线程提问 import multi_threading @@ -487,22 +488,26 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1): results = multi_threading(queries, "", "", 3) # 豆包 else: results = multi_threading(queries, "", file_id, 2) # 豆包 - # technical_requirements = [] - technical_requirements_combined_res = {} + temp_final={} if not results: print("errror!未获得大模型的回答!") else: + # 第一步:收集需要调用 `continue_answer` 的问题和解析结果 + questions_to_continue = [] # 存储需要调用 continue_answer 的 (question, parsed) for question, response in results: parsed=clean_json_string(response) if isinstance(parsed, str): #flag为截断标记,如果不能解析且len(response)>5000,执行继续问答! - parsed=continue_answer(question,parsed) - technical_requirements_combined_res.update(parsed) - # technical_requirements.append(response) - # technical_requirements_combined_res = combine_json_results(technical_requirements) + questions_to_continue.append((question, parsed)) + elif isinstance(parsed, dict): + temp_final.update(parsed) + # 第二步:多线程处理需要调用 `continue_answer` 的问题 + if questions_to_continue: + continued_results = process_continue_answers(questions_to_continue, model_type, file_id) + temp_final.update(continued_results) """根据所有键是否已添加处理技术要求""" # 更新原始采购需求字典 - final_res=combine_and_update_results(modified_data, technical_requirements_combined_res) + final_res=combine_and_update_results(modified_data, temp_final) ffinal_res=all_postprocess(final_res) ffinal_res["货物列表"] = good_list # 输出最终的 JSON 字符串 diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 2e02db4..c588789 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -76,9 +76,9 @@ if __name__ == "__main__": start_time = time.time() output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf" - procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf" + procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf" procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a" - invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf" + invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf" res = fetch_procurement_reqs(procurement_path, invalid_path) print(json.dumps(res, ensure_ascii=False, indent=4)) end_time = time.time()