diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index ced2134..760bdf1 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -138,6 +138,15 @@ def pdf2docx(local_path_in): def doc2docx(local_path_in): if not local_path_in: return "" + # 获取文件名和所在文件夹 + filename, folder = get_filename_and_folder(local_path_in) + + # 检查是否已经存在同名的 .docx 文件 + docx_file_path = os.path.join(folder, f"{filename}.docx") + if os.path.exists(docx_file_path): + print(f"Skipping conversion, {docx_file_path} already exists.") + return docx_file_path + remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d' receive_download_url = upload_file(local_path_in, remote_url) print(receive_download_url) @@ -149,6 +158,13 @@ def doc2docx(local_path_in): def docx2pdf(local_path_in): if not local_path_in: return "" + # 获取文件名和所在文件夹 + filename, folder = get_filename_and_folder(local_path_in) + # 检查是否已经存在同名的 .pdf 文件 + pdf_file_path = os.path.join(folder, f"{filename}.pdf") + if os.path.exists(pdf_file_path): + print(f"Skipping conversion, {pdf_file_path} already exists.") + return pdf_file_path remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p' receive_download_url = upload_file(local_path_in, remote_url) filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 84fc462..0e7fa1b 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -372,7 +372,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1): if '否' in judge_res or model_type==0: model_type = 0 # 使用qianwen-long+invalid_path print("no!调用invalid_path") - if invalid_path.lower().endswith('.pdf'): #确保上传的是docx + if invalid_path.lower().endswith('.pdf'): #确保上传的是docx upload中一定是docx,但是get_deviation中可能上传的是pdf invalid_path = pdf2docx(invalid_path) file_id = upload_file(invalid_path) user_query = generate_prompt(judge_res) diff --git 
a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index a8f35ad..b205363 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -27,13 +27,19 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() try: - if procurement_path==invalid_path: + if procurement_path == invalid_path: + # 读取 PDF 页码数 page_count = get_pdf_page_count(procurement_path) - model_type=0 - processed_filepath="" + + if page_count > 80: # 如果页码数大于 80 + model_type = 0 + processed_filepath = "" + else: + model_type = 1 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 else: model_type = 1 - processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 # processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: