From 94b945e6b131148f63810f6623f7de8ce6ac58f2 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 17 Jan 2025 15:45:53 +0800 Subject: [PATCH] =?UTF-8?q?1.17=20=E5=B0=8Fbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/model_continue_query.py | 3 +- flask_app/general/多线程提问.py | 10 ++-- flask_app/general/截取pdf_main.py | 6 +-- flask_app/general/截取pdf通用函数.py | 57 ++++++++++++----------- flask_app/general/无效标和废标公共代码.py | 10 ++-- flask_app/货物标/截取pdf货物标版.py | 7 ++- flask_app/货物标/技术参数要求提取.py | 10 ++-- 7 files changed, 56 insertions(+), 47 deletions(-) diff --git a/flask_app/general/model_continue_query.py b/flask_app/general/model_continue_query.py index 769add4..e0df96e 100644 --- a/flask_app/general/model_continue_query.py +++ b/flask_app/general/model_continue_query.py @@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None) # 根据模型类型选择调用的模型 if model_type == 1: # 使用 doubao_model - model_res = doubao_model(continue_query) + # model_res = doubao_model(continue_query) + model_res=qianwen_plus(continue_query) elif model_type == 2: # 使用 qianwen_long if file_id is None: raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!") diff --git a/flask_app/general/多线程提问.py b/flask_app/general/多线程提问.py index 5417c27..e10cb36 100644 --- a/flask_app/general/多线程提问.py +++ b/flask_app/general/多线程提问.py @@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads from llama_index.indices.managed.dashscope import DashScopeCloudRetriever from flask_app.general.doubao import doubao_model -from flask_app.general.通义千问long import qianwen_long, upload_file +from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus + prompt = """ # 角色 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 @@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type result_queue.put((ans_index, (question, qianwen_res))) elif llm_type==3: # print(f"doubao! question:{question}") - doubao_res=doubao_model(question,need_extra) - if not doubao_res: + # doubao_res=doubao_model(question,need_extra) + qianwen_plus_res=qianwen_plus(question,need_extra) + if not qianwen_plus_res: result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None else: - result_queue.put((ans_index, (question, doubao_res))) + result_queue.put((ans_index, (question, qianwen_plus_res))) else : assistant = pure_assistant() ans = send_message(assistant, message=question) diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index 201ef18..11442a7 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -119,14 +119,14 @@ if __name__ == "__main__": # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" - pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" - output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33" + output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)" # selections = [1, 4] # 仅处理 selection 4、1 # selections = [1, 2, 3, 5] # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering - files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering') + files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods') print(files) # print(files[-1]) # print(files[-2]) diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 9ef489b..361817f 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -118,11 +118,9 @@ def convert_to_pdf(file_path): def get_invalid_file(file_path, output_folder, common_header, begin_page): """ 从指定的PDF文件中提取无效部分并保存到输出文件夹中。 - begin_pattern从前往后匹配 end_pattern从后往前 + begin_pattern匹配开头 end_pattern匹配结尾 页数小于100不进行begin_pattern,默认start_page=0 页数大于200时end_pattern从前往后匹配 - - Args: file_path (str): 输入的PDF文件路径。 output_folder (str): 提取后文件的输出文件夹路径。 @@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): # 定义开始模式 begin_patterns = [ regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请', + r'.*(? 200: - # print("总页数大于200页,结束页从前往后查找,跳过前30页。") #防止匹配到目录这块 + # print("总页数大于200页,结束页从前往后查找,跳过前30页。") #且先匹配合同 for pattern in end_patterns: for i in range(30, total_pages): text = page_texts[i] + exclusion_pattern = regex.compile( + r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) + if regex.search(exclusion_pattern,text): + continue if regex.search(pattern, text): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i @@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): future_end = executor.submit(find_end_page) start_page = future_start.result() end_page = future_end.result() - # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}") - # 验证页码范围 if start_page > end_page: print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})") @@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None): r'|' + common_patterns ) + end_pattern_template_third = ( r'(?start_page>begin_page + start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) - if end_page2 is None: - return start_page1, end_page1, None - - return start_page1, end_page1, end_page2 + if start_page2 is None or end_page2 is None: + return start_page1, end_page1, end_page1,end_page1 + return start_page1, end_page1, start_page2,end_page2 +#return start_page1, end_page1, end_page1 def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage): begin_pattern = regex.compile( r'.*(?