From f5261087b474cd5fee1ff3a2a2a134f80caca87c Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 19 Dec 2024 14:13:49 +0800 Subject: [PATCH] =?UTF-8?q?12.19=20=E4=BF=AE=E5=A4=8D=E8=B1=86=E5=8C=85?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E4=BD=BF=E7=94=A8=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/clean_pdf.py | 18 +++++++++++++++++- flask_app/general/doubao.py | 12 ++++++++---- flask_app/货物标/技术参数要求提取.py | 25 +++++++++++-------------- flask_app/货物标/提取采购需求main.py | 12 +++++++++--- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index 0e40bf2..d55dc6d 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -1,5 +1,9 @@ import re from PyPDF2 import PdfReader + +from flask_app.general.format_change import docx2pdf + + def extract_common_header(pdf_path): def get_headers(pdf_document, start_page, pages_to_read): @@ -105,7 +109,19 @@ def is_scanned_pdf(file_path, max_pages=15): return False # 不是扫描型 return True # 前 max_pages 页都没有文本 - +def get_pdf_page_count(file_path): + """ + 获取 PDF 文件的页码数量 + """ + try: + pdf_path=file_path + if file_path.lower().endswith(('.doc', '.docx')): + pdf_path = docx2pdf(file_path) + reader = PdfReader(pdf_path) + return len(reader.pages) + except Exception as e: + print(f"读取 PDF 页码时出错:{e}") + return 0 if __name__ == '__main__': file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf" res=is_scanned_pdf(file_path) diff --git a/flask_app/general/doubao.py b/flask_app/general/doubao.py index b0c6b22..6d27461 100644 --- a/flask_app/general/doubao.py +++ b/flask_app/general/doubao.py @@ -173,16 +173,20 @@ def read_txt_to_string(file_path): - file_path (str): txt文件的路径 返回: - - str: 包含文件内容的字符串 + - str: 包含文件内容的字符串,如果出错返回空字符串 "" """ + if not file_path: # 检查文件路径是否为空 + return "" + try: with open(file_path, 'r', encoding='utf-8') as file: # 确保使用适当的编码 content = file.read() # 使用 read() 保持文件格式 return content - except FileNotFoundError: - return "错误:文件未找到。" + except (FileNotFoundError, IOError): # 捕获文件未找到或I/O错误 + return "" except Exception as e: - return f"错误:读取文件时发生错误。详细信息:{e}" + # 可根据需要记录错误日志,但返回值仍是空字符串 + return "" @sleep_and_retry @limits(calls=10, period=1) # 每秒最多调用10次 diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 9500c6c..84fc462 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -359,27 +359,26 @@ def generate_prompt(judge_res, full_text=None): return base_prompt #文件内容以markdown格式组织,其中表格部分(若有)以html语法组织, -def get_technical_requirements(invalid_path,processed_filepath): +def get_technical_requirements(invalid_path,processed_filepath,model_type=1): + judge_res="" file_id = "" - model_type = 1 # 默认使用豆包 - first_query_template="""该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否' -文件内容: -{full_text} - """ - judge_query = generate_full_user_query(processed_filepath, first_query_template) - # print(judge_query) - judge_res = doubao_model(judge_query) - if '否' in judge_res: + full_text = read_txt_to_string(processed_filepath) + if model_type: + first_query_template = """该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否' +{} +""" + judge_query = first_query_template.format(f"文件内容:{full_text}") + judge_res = doubao_model(judge_query) + if '否' in judge_res or model_type==0: model_type = 0 # 使用qianwen-long+invalid_path print("no!调用invalid_path") if invalid_path.lower().endswith('.pdf'): #确保上传的是docx invalid_path = pdf2docx(invalid_path) - file_id=upload_file(invalid_path) + file_id = upload_file(invalid_path) user_query = generate_prompt(judge_res) model_res=qianwen_long(file_id,user_query) print(model_res) else: - full_text = read_txt_to_string(processed_filepath) user_query=generate_prompt(judge_res,full_text) model_res=doubao_model(user_query) print(model_res) @@ -465,7 +464,6 @@ def get_technical_requirements(invalid_path,processed_filepath): modified_key = key.replace('.', '下的') # 使用修改后的键填充第一个占位符,原始键填充第二个占位符 if model_type: - full_text = read_txt_to_string(processed_filepath) new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容:{full_text}") #转豆包后取消注释 else: new_query = user_query_template.format(modified_key, key, modified_key,"") @@ -477,7 +475,6 @@ def get_technical_requirements(invalid_path,processed_filepath): # 将键中的 '.' 替换为 '下的' modified_grouped_key = grouped_key.replace('.', '下的') if model_type: - full_text = read_txt_to_string(processed_filepath) new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key, modified_grouped_key, f"文件内容:{full_text}") else: diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 4f0f4a3..a8f35ad 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -1,7 +1,7 @@ import concurrent.futures import json import time - +from flask_app.general.clean_pdf import get_pdf_page_count from flask_app.general.doubao import pdf2txt from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx @@ -27,13 +27,19 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() try: - processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 + if procurement_path==invalid_path: + page_count = get_pdf_page_count(procurement_path) + model_type=0 + processed_filepath="" + else: + model_type = 1 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 # processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: # 提交任务给线程池 - future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath) + future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type) time.sleep(0.5) # 保持原有的延时 future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath) # 获取并行任务的结果