zbparse/flask_app/货物标/提取采购需求main.py

84 lines
4.4 KiB
Python
Raw Normal View History

2024-09-23 15:49:30 +08:00
import concurrent.futures
2024-09-13 15:03:55 +08:00
import json
import os
2024-09-23 15:49:30 +08:00
import time
2024-12-19 14:39:35 +08:00
from flask_app.general.format_change import get_pdf_page_count
2024-12-17 14:47:19 +08:00
from flask_app.general.file2markdown import convert_file_to_markdown
2024-10-25 17:50:20 +08:00
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
2024-10-15 21:03:02 +08:00
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
2024-10-22 10:06:22 +08:00
2024-11-16 16:14:53 +08:00
def fetch_procurement_reqs(procurement_path, invalid_path):
    """Collect the procurement requirements of a goods tender document.

    The technical-parameter extraction and the business/service/other
    requirement extraction are submitted to a thread pool and their results
    are merged into a single dictionary.

    Args:
        procurement_path: Path of the procurement-requirements excerpt
            (may be a pdf or a docx). A falsy value short-circuits to the
            empty default result.
        invalid_path: Path of the fallback document. When it resolves to the
            same absolute path as ``procurement_path``, the excerpt could not
            be cut out earlier and the fallback file itself is processed.

    Returns:
        A dict whose "采购需求" entry comes from ``get_technical_requirements``
        and whose remaining entries are whatever ``get_business_requirements``
        returns. If the path is empty or any exception is raised, a dict with
        the five default keys and empty values is returned instead.
    """
    # Fallback payload used for empty input and for any failure below.
    empty_reqs = {
        "采购需求": {},  # requirements on the specific goods (technical parameters)
        "技术要求": [],  # technical/business/service demands on the supplier, not on the goods
        "商务要求": [],
        "服务要求": [],
        "其他要求": [],
    }

    # Nothing to analyse: hand back the empty structure right away.
    if not procurement_path:
        return dict(empty_reqs)

    try:
        proc_path = os.path.abspath(procurement_path)
        invalid_path = os.path.abspath(invalid_path)

        # Identical paths mean the procurement excerpt came out empty earlier,
        # so the fallback document is what is being processed here.
        using_fallback = proc_path == invalid_path

        # The page count is only looked up for the fallback document.
        if using_fallback and get_pdf_page_count(procurement_path) > 60:
            # More than 60 pages: skip the markdown conversion.
            tech_model_type = 2  # long
            busi_model_type = 3  # long-stream
            processed_filepath = ""
        else:
            tech_model_type = 1  # doubao or qianwen-plus
            busi_model_type = 4  # qianwen-plus
            if using_fallback:
                # Fallback document -> markdown.
                processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
            else:
                # Normal case: procurement excerpt -> markdown.
                processed_filepath = convert_file_to_markdown(procurement_path)

        # Run both extractors in parallel on a thread pool.
        with concurrent.futures.ThreadPoolExecutor() as pool:
            # Goods-level procurement requirements.
            tech_future = pool.submit(
                get_technical_requirements,
                invalid_path,
                processed_filepath,
                tech_model_type,
            )
            time.sleep(0.5)  # keep the original stagger between the two submissions
            # Technical / business / service / other requirements.
            busi_future = pool.submit(
                get_business_requirements,
                procurement_path,
                processed_filepath,
                busi_model_type,
            )

            # Wait for both tasks, technical first.
            tech_result = tech_future.result()
            busi_result = busi_future.result()

        # Assemble the final dictionary: technical part first, then the rest.
        # NOTE: default keys missing from the business result are not
        # back-filled on this success path.
        merged = {}
        merged["采购需求"] = tech_result.get("采购需求", {})
        merged.update(busi_result)
        return merged

    except Exception as e:
        print(f"Error in fetch_procurement_reqs: {e}")
        # On any failure fall back to the empty default structure.
        return dict(empty_reqs)
2024-09-13 15:03:55 +08:00
if __name__ == "__main__":
    # Manual smoke run against hard-coded local sample files; prints the
    # extracted requirements as JSON followed by the elapsed wall-clock time.
    start_time = time.time()

    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    procurement_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\需求\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件N510113202400010820250102001.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"

    res = fetch_procurement_reqs(procurement_path, invalid_path)
    pretty = json.dumps(res, ensure_ascii=False, indent=4)
    print(pretty)

    end_time = time.time()
    elapsed = end_time - start_time
    print("耗时:" + str(elapsed))