zbparse/flask_app/货物标/提取采购需求main.py

84 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import concurrent.futures
import json
import os
import time
from flask_app.general.format_change import get_pdf_page_count
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
def fetch_procurement_reqs(procurement_path, invalid_path):
    """Extract the procurement requirements of a tender document.

    The input is converted to markdown when it is short enough, then
    ``get_technical_requirements`` and ``get_business_requirements`` are run
    concurrently in a thread pool and their results are merged.

    Args:
        procurement_path: Path of the procurement section; may be a PDF or a
            DOCX file. A falsy value (``""``/``None``) short-circuits to the
            empty default result.
        invalid_path: Path of the fallback document. When it resolves to the
            same absolute path as ``procurement_path`` the procurement section
            could not be cut out earlier (per the original author's note), so
            the page count decides whether markdown conversion is attempted.

    Returns:
        dict: ``"采购需求"`` taken from the technical extraction, updated with
        everything the business extraction returned. If ``procurement_path``
        is falsy, or any exception is raised while processing, a dict with
        the five default keys and empty values is returned instead.
    """
    # Default result: returned for empty input and whenever processing fails.
    DEFAULT_PROCUREMENT_REQS = {
        "采购需求": {},  # requirements on the goods themselves (technical parameters)
        "技术要求": [],  # technical/business/service requirements on the supplier, not on the goods
        "商务要求": [],
        "服务要求": [],
        "其他要求": [],
    }
    # Documents longer than this are not converted to markdown.
    max_markdown_pages = 60

    # Nothing to analyse: hand back the empty default structure.
    if not procurement_path:
        return DEFAULT_PROCUREMENT_REQS.copy()

    try:
        proc_path = os.path.abspath(procurement_path)
        invalid_path = os.path.abspath(invalid_path)

        # Identical paths mean the procurement section was empty when it was
        # first extracted, so the fallback document is being processed.
        if proc_path == invalid_path:
            # NOTE: procurement_path may actually be a DOCX fallback file here.
            page_count = get_pdf_page_count(procurement_path)
            if page_count > max_markdown_pages:
                tech_model_type = 2  # "long" model
                busi_model_type = 3  # "long-stream" model
                processed_filepath = ""  # no markdown conversion for long files
            else:
                tech_model_type = 1  # doubao or qianwen-plus
                busi_model_type = 4  # qianwen-plus
                # Fallback document -> markdown.
                processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
        else:
            tech_model_type = 1  # doubao or qianwen-plus
            busi_model_type = 4
            # Normal case: procurement section -> markdown.
            processed_filepath = convert_file_to_markdown(procurement_path)

        # Run both extractions in parallel.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Goods-level procurement requirements (technical parameters).
            future_technical = executor.submit(
                get_technical_requirements, invalid_path, processed_filepath, tech_model_type
            )
            time.sleep(0.5)  # stagger the second submission (kept from the original)
            # Supplier-level technical / business / service / other requirements.
            future_business = executor.submit(
                get_business_requirements, procurement_path, processed_filepath, busi_model_type
            )
            # Collect both results; an exception raised in a worker is
            # re-raised here and handled by the except clause below.
            technical_requirements = future_technical.result()
            business_requirements = future_business.result()

        # Merge: goods requirements first, then the business-side sections.
        procurement_reqs = {
            "采购需求": technical_requirements.get("采购需求", {})
        }
        procurement_reqs.update(business_requirements)
        # To guarantee that every default key is present, enable:
        # for key, default_value in DEFAULT_PROCUREMENT_REQS.items():
        #     procurement_reqs.setdefault(key, default_value)
        return procurement_reqs
    except Exception as e:
        print(f"Error in fetch_procurement_reqs: {e}")
        # Best effort: fall back to the empty default structure on any error.
        return DEFAULT_PROCUREMENT_REQS.copy()
if __name__ == "__main__":
    # Manual smoke test: run the extraction on a local sample file, print the
    # result as JSON and report the elapsed wall-clock time.
    start_time = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"  # not referenced below
    # Alternative sample input, kept for manual debugging:
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
    procurement_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\需求\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件N510113202400010820250102001.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"  # not referenced below
    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"
    res = fetch_procurement_reqs(procurement_path, invalid_path)
    # ensure_ascii=False keeps the Chinese keys readable in the console output.
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    print("耗时:" + str(end_time - start_time))