# zbparse/flask_app/货物标/提取采购需求main.py

import concurrent.futures
import json
import os
import time
from  flask_app.general.format_change import get_pdf_page_count
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements


def _task_result_or_none(future, task_name):
    """Return the dict produced by ``future``, or ``None`` if the task failed.

    A raised exception or a non-dict return value is reported on stdout and
    mapped to ``None`` so that the caller can still use the other task's result.
    """
    try:
        result = future.result()
    except Exception as e:
        print(f"Error in fetch_procurement_reqs ({task_name}): {e}")
        return None
    if not isinstance(result, dict):
        print(
            f"Error in fetch_procurement_reqs ({task_name}): "
            f"expected dict, got {type(result).__name__}"
        )
        return None
    return result


def fetch_procurement_reqs(procurement_path, invalid_path):
    """Extract procurement requirements from a tender document.

    Runs ``get_technical_requirements`` and ``get_business_requirements`` in
    two worker threads and merges their results into a single dict.

    Args:
        procurement_path: Path to the procurement section of the tender
            (per the original note it may be a pdf or a docx). A falsy value
            short-circuits to the empty default result.
        invalid_path: Path that is compared with ``procurement_path``; when
            both resolve to the same absolute path the procurement section is
            treated as "not extracted separately".
            NOTE(review): presumably the full/fallback tender file - confirm
            against the caller.

    Returns:
        dict: ``{"采购需求": ...}`` taken from the technical task, updated with
        whatever keys the business task returned. If only one task fails, the
        failed part falls back to its empty default while the other part is
        kept. If setup fails or both tasks fail, the full empty default dict
        is returned. This function never raises ``Exception`` subclasses.
    """

    def default_reqs():
        # Built fresh on every call so callers can mutate the result freely.
        return {
            "采购需求": {},  # purchase requirements for the goods themselves (technical parameters)
            "技术要求": [],  # technical/business/service demands on the supplier, not on the goods
            "商务要求": [],
            "服务要求": [],
            "其他要求": [],
        }

    # No procurement file at all: return the empty default structure.
    if not procurement_path:
        return default_reqs()

    try:
        proc_path = os.path.abspath(procurement_path)
        invalid_path = os.path.abspath(invalid_path)
        page_count = get_pdf_page_count(procurement_path)

        if page_count > 60:
            # Too many pages: token usage and document-conversion cost would be
            # too high, so fall back to the long-context model without
            # converting the file.
            tech_model_type = 2  # long-stream
            busi_model_type = 3  # long-stream
            processed_filepath = ""
            # Hand the procurement file itself to the technical task
            # (no-op when both paths are already identical).
            invalid_path = proc_path
        else:
            tech_model_type = 1  # doubao or qianwen-plus
            busi_model_type = 4  # qianwen-plus
            # Identical paths mean the procurement section was not extracted
            # separately; in that case an explicit output name is passed.
            if proc_path == invalid_path:
                processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
            else:
                processed_filepath = convert_file_to_markdown(procurement_path)

        # Only two tasks are ever submitted, so two workers are enough.
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            future_technical = executor.submit(
                get_technical_requirements, invalid_path, processed_filepath, tech_model_type
            )  # purchase requirements
            time.sleep(0.5)  # keep the original stagger between the two submissions
            future_business = executor.submit(
                get_business_requirements, procurement_path, processed_filepath, busi_model_type
            )  # technical / business / service / other requirements
            # Collect each result separately so that one failing task does not
            # throw away the other task's output.
            technical_requirements = _task_result_or_none(
                future_technical, "get_technical_requirements"
            )
            business_requirements = _task_result_or_none(
                future_business, "get_business_requirements"
            )

        if technical_requirements is None and business_requirements is None:
            return default_reqs()

        # When the business task failed, start from the defaults so its keys are
        # still present (same shape as the total-failure result); otherwise
        # start empty so the success path matches the previous output exactly.
        procurement_reqs = default_reqs() if business_requirements is None else {}
        procurement_reqs["采购需求"] = (technical_requirements or {}).get("采购需求", {})
        procurement_reqs.update(business_requirements or {})

        # To force every default key to be present even on success, uncomment:
        # for key, default_value in default_reqs().items():
        #     procurement_reqs.setdefault(key, default_value)

        return procurement_reqs

    except Exception as e:
        print(f"Error in fetch_procurement_reqs: {e}")
        # On any setup error return the empty default structure.
        return default_reqs()

if __name__ == "__main__":
    # Manual smoke run against sample tender files on a developer machine;
    # prints the extracted requirements as JSON followed by the elapsed time.
    started_at = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    # Alternative sample input:
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_procurement.pdf"
    procurement_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\需求\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"

    extracted = fetch_procurement_reqs(procurement_path, invalid_path)
    pretty_json = json.dumps(extracted, ensure_ascii=False, indent=4)
    print(pretty_json)

    elapsed_seconds = time.time() - started_at
    print(f"耗时:{elapsed_seconds}")