2.15 应该是PyPDF2库的问题

This commit is contained in:
zy123 2025-02-15 22:30:15 +08:00
parent 110d1767f4
commit aea934cc31
7 changed files with 102 additions and 96 deletions

View File

@ -205,7 +205,7 @@ def get_pdf_page_count(file_path):
if __name__ == '__main__':
# 替换为你的文件路径和API URL
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc"
local_path_in = r"C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\tmp\invalid_added.pdf"
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
@ -213,8 +213,8 @@ if __name__ == '__main__':
# intermediate_docx = pdf2docx(local_path_in)
# if intermediate_docx:
# normal_pdf = docx2pdf(intermediate_docx, force=True)
# # downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
downloaded_file=pdf2docx(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"

View File

@ -108,9 +108,7 @@ if __name__ == '__main__':
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
# input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
# output=insert_mark(input)
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx'
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
res=delete_mark(doc_path)
if res:
print(res)
else:
print("No")
if not res:
print("yes")

View File

@ -1,14 +1,13 @@
import concurrent.futures
import os
import time
from memory_profiler import profile
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selections=None):
"""
统一的 PDF 截取和合并函数支持 'goods' 'engineering' 两种模式

View File

@ -303,7 +303,7 @@ def is_pure_image(docx_path, percentage=0.3):
return True
if __name__ == '__main__':
pdf_path=r"C:\Users\Administrator\Desktop\ztbfile.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\ztbfile.pdf"
res=is_scanned_pdf(pdf_path)
if res:
print("扫描型")

View File

@ -136,56 +136,56 @@ def process_and_stream(file_url, zb_type):
base_end_time = time.time()
logger.info(f"分段解析完成,耗时:{base_end_time - start_time:.2f}")
# #此时前端已完整接收到解析的所有内容后面的内容与前端展示无关主要是后处理1.extracted_result关键信息存储 2.技术偏离表 3.商务偏离表 4.投标人需提交的证明材料(目前后端存储了,前端还未展示)
# #后处理开始!!!
# output_json_path = os.path.join(output_folder, 'final_result.json')
# extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
# includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
# final_result, extracted_info, tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation, proof_materials = outer_post_processing(
# combined_data, includes, good_list) #后处理 生成 extracted_info、商务 技术偏离数据 以及证明材料返给后端
#
# #后处理完毕!后面都是生成响应返回,不额外修改数据
# tech_deviation_response, tech_deviation_star_response, zigefuhe_deviation_response, shangwu_deviation_response, shangwu_star_deviation_response, proof_materials_response = generate_deviation_response(
# tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,
# proof_materials, logger) #生成规范的响应
#
# # 使用通用响应函数
# yield sse_format(tech_deviation_response)
# yield sse_format(tech_deviation_star_response)
# yield sse_format(zigefuhe_deviation_response)
# yield sse_format(shangwu_deviation_response)
# yield sse_format(shangwu_star_deviation_response)
# yield sse_format(proof_materials_response)
#
# try:
# with open(extracted_info_path, 'w', encoding='utf-8') as json_file:
# json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
# logger.info(f"摘取后的数据已保存到 '{extracted_info_path}'")
# except IOError as e:
# logger.error(f"保存JSON文件时出错: {e}")
# log_error_unique_id(unique_id,1) # 记录失败的 unique_id
#
# try:
# with open(output_json_path, 'w', encoding='utf-8') as json_file:
# json.dump(final_result, json_file, ensure_ascii=False, indent=4)
# logger.info(f"合并后的数据已保存到 '{output_json_path}'")
# except IOError as e:
# logger.error(f"保存JSON文件时出错: {e}")
# log_error_unique_id(unique_id,1) # 记录失败的 unique_id
#
# extracted_info_response = create_response(
# message='extracted_info',
# status='success',
# data=json.dumps(extracted_info, ensure_ascii=False)
# )
# yield sse_format(extracted_info_response)
#
# complete_response = create_response(
# message='Combined_data',
# status='success',
# data=json.dumps(final_result, ensure_ascii=False)
# )
# yield sse_format(complete_response)
#此时前端已完整接收到解析的所有内容后面的内容与前端展示无关主要是后处理1.extracted_result关键信息存储 2.技术偏离表 3.商务偏离表 4.投标人需提交的证明材料(目前后端存储了,前端还未展示)
#后处理开始!!!
output_json_path = os.path.join(output_folder, 'final_result.json')
extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
final_result, extracted_info, tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation, proof_materials = outer_post_processing(
combined_data, includes, good_list) #后处理 生成 extracted_info、商务 技术偏离数据 以及证明材料返给后端
#后处理完毕!后面都是生成响应返回,不额外修改数据
tech_deviation_response, tech_deviation_star_response, zigefuhe_deviation_response, shangwu_deviation_response, shangwu_star_deviation_response, proof_materials_response = generate_deviation_response(
tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,
proof_materials, logger) #生成规范的响应
# 使用通用响应函数
yield sse_format(tech_deviation_response)
yield sse_format(tech_deviation_star_response)
yield sse_format(zigefuhe_deviation_response)
yield sse_format(shangwu_deviation_response)
yield sse_format(shangwu_star_deviation_response)
yield sse_format(proof_materials_response)
try:
with open(extracted_info_path, 'w', encoding='utf-8') as json_file:
json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
logger.info(f"摘取后的数据已保存到 '{extracted_info_path}'")
except IOError as e:
logger.error(f"保存JSON文件时出错: {e}")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
try:
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(final_result, json_file, ensure_ascii=False, indent=4)
logger.info(f"合并后的数据已保存到 '{output_json_path}'")
except IOError as e:
logger.error(f"保存JSON文件时出错: {e}")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
extracted_info_response = create_response(
message='extracted_info',
status='success',
data=json.dumps(extracted_info, ensure_ascii=False)
)
yield sse_format(extracted_info_response)
complete_response = create_response(
message='Combined_data',
status='success',
data=json.dumps(final_result, ensure_ascii=False)
)
yield sse_format(complete_response)
final_response = create_response( #目前后端的逻辑是读取到'data'中有个'END',就终止连接
message='文件上传并处理成功',

View File

@ -89,6 +89,11 @@ def preprocess_files(output_folder, file_path, file_type,logger):
else:
invalid_deleted_docx = file_path
invalid_added_docx = ''
try:
# 尝试加载 .docx 文件
doc = Document(invalid_deleted_docx)
except Exception as e:
invalid_deleted_docx=pdf_path if pdf_path!="" else file_path #文档转换还是失败!最后的手段保证文件可用!
end_time=time.time()
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f}")

View File

@ -43,7 +43,6 @@ def preprocess_files(output_folder, file_path, file_type,logger):
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
# print("切割出的文件:"+str(truncate_files))
# 处理各个部分
# invalid_docpath = invalid_added_docx # docx截取无效标部分
procurement_path = truncate_files[5] # 采购需求
evaluation_method_path = truncate_files[1] # 评标办法
qualification_path = truncate_files[2] # 资格审查
@ -55,25 +54,30 @@ def preprocess_files(output_folder, file_path, file_type,logger):
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
truncate_endtime = time.time()
logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}")
# if not is_pure_image_flag:
# invalid_added_pdf = insert_mark(invalid_path)
# change_start=time.time()
# invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
# change_end=time.time()
# logger.info(f"文档转换p2d耗时{change_end - change_start:.2f} 秒")
# try:
# # 尝试加载 .docx 文件
# doc = Document(invalid_added_docx)
# # print("yes")
# except Exception as e:
# # 捕获异常并打印错误信息
# invalid_added_docx=pdf2docx(invalid_path)
# invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
# if not invalid_deleted_docx:
# invalid_deleted_docx=pdf2docx(invalid_path)
# else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
if not is_pure_image_flag:
invalid_added_pdf = insert_mark(invalid_path)
change_start=time.time()
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
change_end=time.time()
logger.info(f"文档转换p2d耗时{change_end - change_start:.2f}")
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
# print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
try:
# 尝试加载 .docx 文件
doc = Document(invalid_deleted_docx)
except Exception as e:
invalid_deleted_docx=pdf_path if pdf_path!="" else file_path #文档转换还是失败!最后的手段保证文件可用!
end_time = time.time()
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f}")
@ -231,21 +235,21 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
with concurrent.futures.ThreadPoolExecutor() as executor:
# 立即启动不依赖 knowledge_name 和 index 的任务
futures = {
# 'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_deleted_docx'], #技术评分 商务评分
# processed_data['evaluation_method_path'],logger),
#
# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'], #无效标与废标项
# output_folder,logger),
'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_deleted_docx'], #技术评分 商务评分
processed_data['evaluation_method_path'],logger),
# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
# processed_data['clause_path'],logger), #投标文件要求
#
# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
# processed_data['clause_path'],logger), #开评定标流程
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'], #无效标与废标项
output_folder,logger),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
processed_data['clause_path'],logger), #投标文件要求
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
processed_data['clause_path'],logger), #开评定标流程
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], #基础信息
processed_data['procurement_path'],processed_data['clause_path'],logger),
# 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], #基础信息
# processed_data['procurement_path'],processed_data['clause_path'],logger),
#
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_deleted_docx'], #资格审查
processed_data['qualification_path'],
processed_data['notice_path'],logger),
@ -280,8 +284,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
}
yield json.dumps(default_evaluation, ensure_ascii=False)
# yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
# if collected_good_list is not None:
# yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)
if collected_good_list is not None:
yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)
#TODO:小解析考虑提速1直接pdf转文本再切分。后期考虑。