2.15 排查截取pdf
This commit is contained in:
parent
add2bad285
commit
110d1767f4
@ -55,20 +55,23 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
|
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
|
||||||
truncate_endtime = time.time()
|
truncate_endtime = time.time()
|
||||||
logger.info(f"文件切分CPU耗时:{truncate_endtime - start_time:.2f} 秒")
|
logger.info(f"文件切分CPU耗时:{truncate_endtime - start_time:.2f} 秒")
|
||||||
if not is_pure_image_flag:
|
# if not is_pure_image_flag:
|
||||||
invalid_added_pdf = insert_mark(invalid_path)
|
# invalid_added_pdf = insert_mark(invalid_path)
|
||||||
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path,用于废标项提取,使用正则。
|
# change_start=time.time()
|
||||||
try:
|
# invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path,用于废标项提取,使用正则。
|
||||||
# 尝试加载 .docx 文件
|
# change_end=time.time()
|
||||||
doc = Document(invalid_added_docx)
|
# logger.info(f"文档转换p2d耗时:{change_end - change_start:.2f} 秒")
|
||||||
# print("yes")
|
# try:
|
||||||
except Exception as e:
|
# # 尝试加载 .docx 文件
|
||||||
# 捕获异常并打印错误信息
|
# doc = Document(invalid_added_docx)
|
||||||
invalid_added_docx=pdf2docx(invalid_path)
|
# # print("yes")
|
||||||
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
# except Exception as e:
|
||||||
if not invalid_deleted_docx:
|
# # 捕获异常并打印错误信息
|
||||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
# invalid_added_docx=pdf2docx(invalid_path)
|
||||||
else: #主要是节约了pdf2docx的一块钱
|
# invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
||||||
|
# if not invalid_deleted_docx:
|
||||||
|
# invalid_deleted_docx=pdf2docx(invalid_path)
|
||||||
|
# else: #主要是节约了pdf2docx的一块钱
|
||||||
invalid_deleted_docx=file_path
|
invalid_deleted_docx=file_path
|
||||||
invalid_added_docx='' #由于传入的docx是纯图片型,正则是提取不到的,需要调用大模型。
|
invalid_added_docx='' #由于传入的docx是纯图片型,正则是提取不到的,需要调用大模型。
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user