12.18 截取pdf

This commit is contained in:
zy123 2024-12-18 12:26:27 +08:00
parent 456659b432
commit b8d310a0b0
5 changed files with 14 additions and 13 deletions

View File

@@ -90,6 +90,7 @@ def fetch_project_basic_info(invalid_deleted_docx, merged_baseinfo_path, procure
if not procurement_path: if not procurement_path:
procurement_path = invalid_deleted_docx procurement_path = invalid_deleted_docx
basic_res = combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_deleted_docx) basic_res = combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_deleted_docx)
print(json.dumps(basic_res,ensure_ascii=False,indent=4))
base_info, good_list = post_process_baseinfo(basic_res, logger) base_info, good_list = post_process_baseinfo(basic_res, logger)
result = base_info, good_list result = base_info, good_list
end_time = time.time() end_time = time.time()
@@ -122,7 +123,6 @@ def fetch_evaluation_standards(invalid_deleted_docx, evaluation_method_path,logg
start_time = time.time() start_time = time.time()
if not evaluation_method_path: if not evaluation_method_path:
evaluation_method_path = invalid_deleted_docx evaluation_method_path = invalid_deleted_docx
print(evaluation_method_path)
evaluation_standards_res = combine_evaluation_standards(evaluation_method_path,invalid_deleted_docx,2) evaluation_standards_res = combine_evaluation_standards(evaluation_method_path,invalid_deleted_docx,2)
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}

View File

@@ -258,7 +258,7 @@ def generate_template(required_keys,full_text, type=1):
def get_business_requirements(procurement_path,processed_filepath): def get_business_requirements(procurement_path,processed_filepath):
required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求","\s*体\s*要\s*求","\s*设\s*要\s*求","\s*度\s*要\s*求","\s*期\s*要\s*求","\s*保\s*要\s*求","\s*训\s*要\s*求","\s*后\s*要\s*求"] required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求","\s*体\s*要\s*求","\s*设\s*要\s*求","\s*度\s*要\s*求","\s*期\s*要\s*求","\s*保\s*要\s*求","\s*训\s*要\s*求","\s*后\s*要\s*求"]
procurement_pdf_path=procurement_path procurement_pdf_path=procurement_path
if procurement_path.lower().endswith('.doc', '.docx'): if procurement_path.lower().endswith(('.doc', '.docx')):
procurement_pdf_path = docx2pdf(procurement_path) procurement_pdf_path = docx2pdf(procurement_path)
contained_keys = find_exists(procurement_pdf_path, required_keys) contained_keys = find_exists(procurement_pdf_path, required_keys)
print(contained_keys) print(contained_keys)
@@ -289,10 +289,12 @@ def get_business_requirements(procurement_path,processed_filepath):
# TODO:改为先判断,再摘取 # TODO:改为先判断,再摘取
if __name__ == "__main__": if __name__ == "__main__":
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx" # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf" # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf"
docx_path=r'C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.docx' procurement_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
# truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf" # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
# file_id = upload_file(truncate_file) # file_id = upload_file(truncate_file)
processed_filepath = pdf2txt(truncate_file) # processed_filepath = pdf2txt(procurement_path)
final_res= get_business_requirements(truncate_file, docx_path) processed_filepath=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
final_res= get_business_requirements(procurement_path,processed_filepath)
print(json.dumps(final_res, ensure_ascii=False, indent=4)) print(json.dumps(final_res, ensure_ascii=False, indent=4))

View File

@@ -209,9 +209,9 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali
if __name__ == "__main__": if __name__ == "__main__":
start_time=time.time() start_time=time.time()
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf" # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
merged_baseinfo_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf" merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx"
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf" # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" procurement_file_path = r"D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx"
clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json' clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path) res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
print("------------------------------------") print("------------------------------------")

View File

@@ -387,7 +387,7 @@ def get_technical_requirements(invalid_path,processed_filepath):
processed_data=truncate_system_keys(cleaned_res['采购需求']) processed_data=truncate_system_keys(cleaned_res['采购需求'])
key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' 输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'} key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' 输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
modified_data=rename_keys(data_copy) modified_data=rename_keys(data_copy)
print(json.dumps(modified_data,ensure_ascii=False,indent=4)) # print(json.dumps(modified_data,ensure_ascii=False,indent=4))
user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号,不可擅自增删内容,尤其是不可擅自添加序号。 user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★和序号,不可擅自增删内容,尤其是不可擅自添加序号。
要求与指南 要求与指南
@@ -538,10 +538,10 @@ if __name__ == "__main__":
# truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx" # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
# output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp" # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
# file_id = upload_file(truncate_file) # file_id = upload_file(truncate_file)
invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\8c63f0c9-d642-4f0c-918c-33db5efd6cd0\extract1.txt" invalid_path=r"D:\flask_project\flask_app\static\output\output1\9f63f6f1-6e91-44d6-83ca-189461bf151f\invalid_del.docx"
# file_id=upload_file(truncate_file) # file_id=upload_file(truncate_file)
# processed_filepath = pdf2txt(truncate_file) # processed_filepath = pdf2txt(truncate_file)
processed_filepath=r"C:\Users\Administrator\Desktop\fsdownload\8c63f0c9-d642-4f0c-918c-33db5efd6cd0\extract1.txt" processed_filepath=r"D:\flask_project\flask_app\static\output\output1\9f63f6f1-6e91-44d6-83ca-189461bf151f\extract1.txt"
res=get_technical_requirements(invalid_path,processed_filepath) res=get_technical_requirements(invalid_path,processed_filepath)
json_string = json.dumps(res, ensure_ascii=False, indent=4) json_string = json.dumps(res, ensure_ascii=False, indent=4)
print(json_string) print(json_string)

View File

@@ -39,7 +39,6 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
# 获取并行任务的结果 # 获取并行任务的结果
technical_requirements = future_technical.result() technical_requirements = future_technical.result()
business_requirements = future_business.result() business_requirements = future_business.result()
# 构建最终的采购需求字典 # 构建最终的采购需求字典
procurement_reqs = { procurement_reqs = {
"采购需求": technical_requirements.get("采购需求", {}) "采购需求": technical_requirements.get("采购需求", {})