2.15 排查截取pdf

This commit is contained in:
zy123 2025-02-15 14:47:26 +08:00
parent add2bad285
commit 110d1767f4

View File

@@ -55,20 +55,23 @@ def preprocess_files(output_folder, file_path, file_type,logger):
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容) invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
truncate_endtime = time.time() truncate_endtime = time.time()
logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}") logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}")
if not is_pure_image_flag: # if not is_pure_image_flag:
invalid_added_pdf = insert_mark(invalid_path) # invalid_added_pdf = insert_mark(invalid_path)
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。 # change_start=time.time()
try: # invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
# 尝试加载 .docx 文件 # change_end=time.time()
doc = Document(invalid_added_docx) # logger.info(f"文档转换p2d耗时{change_end - change_start:.2f} 秒")
# print("yes") # try:
except Exception as e: # # 尝试加载 .docx 文件
# 捕获异常并打印错误信息 # doc = Document(invalid_added_docx)
invalid_added_docx=pdf2docx(invalid_path) # # print("yes")
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path # except Exception as e:
if not invalid_deleted_docx: # # 捕获异常并打印错误信息
invalid_deleted_docx=pdf2docx(invalid_path) # invalid_added_docx=pdf2docx(invalid_path)
else: #主要是节约了pdf2docx的一块钱 # invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
# if not invalid_deleted_docx:
# invalid_deleted_docx=pdf2docx(invalid_path)
# else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。 invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
end_time = time.time() end_time = time.time()