From 110d1767f4da29448a4355f61388070ea9f1ae4e Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Sat, 15 Feb 2025 14:47:26 +0800 Subject: [PATCH] =?UTF-8?q?2.15=20=E6=8E=92=E6=9F=A5=E6=88=AA=E5=8F=96pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/routes/货物标解析main.py | 35 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index a245a0d..b832647 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -55,22 +55,25 @@ def preprocess_files(output_folder, file_path, file_type,logger): invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容) truncate_endtime = time.time() logger.info(f"文件切分CPU耗时:{truncate_endtime - start_time:.2f} 秒") - if not is_pure_image_flag: - invalid_added_pdf = insert_mark(invalid_path) - invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path,用于废标项提取,使用正则。 - try: - # 尝试加载 .docx 文件 - doc = Document(invalid_added_docx) - # print("yes") - except Exception as e: - # 捕获异常并打印错误信息 - invalid_added_docx=pdf2docx(invalid_path) - invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path - if not invalid_deleted_docx: - invalid_deleted_docx=pdf2docx(invalid_path) - else: #主要是节约了pdf2docx的一块钱 - invalid_deleted_docx=file_path - invalid_added_docx='' #由于传入的docx是纯图片型,正则是提取不到的,需要调用大模型。 + # if not is_pure_image_flag: + # invalid_added_pdf = insert_mark(invalid_path) + # change_start=time.time() + # invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path,用于废标项提取,使用正则。 + # change_end=time.time() + # logger.info(f"文档转换p2d耗时:{change_end - change_start:.2f} 秒") + # try: + # # 尝试加载 .docx 文件 + # doc = Document(invalid_added_docx) + # # print("yes") + # except Exception as e: + # # 捕获异常并打印错误信息 + # invalid_added_docx=pdf2docx(invalid_path) + # invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path + # if not invalid_deleted_docx: + # invalid_deleted_docx=pdf2docx(invalid_path) + # else: #主要是节约了pdf2docx的一块钱 + invalid_deleted_docx=file_path + invalid_added_docx='' #由于传入的docx是纯图片型,正则是提取不到的,需要调用大模型。 end_time = time.time() logger.info(f"文件预处理总耗时:{end_time - start_time:.2f} 秒")