# zbparse/flask_app/货物标/基础信息解析货物标版.py

# -*- encoding:utf-8 -*-
import json
import threading
import time
import concurrent.futures
from flask_app.general.json_utils import clean_json_string, add_outer_key
from flask_app.general.通用功能函数 import process_judge_questions, aggregate_basic_info
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.llm.通义千问long import upload_file
from flask_app.old_version.判断是否分包等_old import merge_json_to_list
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs


def get_base_info(merged_baseinfo_path,clause_path,invalid_path):
    """Collect the basic-information answers for a goods tender document.

    The merged base-info file is uploaded, the prompts in the basic-info
    question file are asked against it, and every answer is cleaned into JSON.
    The last answer (the "whether ..." yes/no questions) is post-processed by
    ``merge_json_to_list``; its merged form replaces the raw answer in the
    list. Two follow-up tasks then run concurrently: the judge questions
    (which extend the answer list in place) and the extraction of the
    re-tendering clause from the bidder notice.

    Args:
        merged_baseinfo_path: File that is uploaded for questioning and also
            handed to ``extract_from_notice``.
        clause_path: Passed through to ``extract_from_notice``.
        invalid_path: Passed through to ``process_judge_questions``.

    Returns:
        The list of answer objects, ending with the re-tendering result
        wrapped under the key "重新招标、不再招标和终止招标".

    Raises:
        IndexError: If the model returned no answers at all, because the last
            answer is popped unconditionally.
    """
    prompt_file = 'flask_app/static/提示词/基本信息货物标.txt'
    judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'

    uploaded_id = upload_file(merged_baseinfo_path)
    prompts = read_questions_from_file(prompt_file)
    # Last argument picks the backend: 1 = Bailian RAG, 2 = qianwen-long.
    raw_answers = multi_threading(prompts, "", uploaded_id, 2)

    answers = []
    if raw_answers:
        for _, reply in raw_answers:
            answers.append(clean_json_string(reply))

    # The final answer holds the "whether ..." questions; swap it for its
    # merged form and keep the chosen question numbers for the judge step.
    yes_no_answer = answers.pop()
    chosen_numbers, merged = merge_json_to_list(yes_no_answer)
    answers.append(merged)

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        judge_task = pool.submit(
            process_judge_questions, judge_file_path, chosen_numbers, invalid_path, answers
        )
        # The literal 3 selects the re-tendering extraction mode
        # (per the original comment; semantics live in extract_from_notice).
        rebid_task = pool.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3)

        # process_judge_questions mutates `answers` directly, so only its
        # completion (and any exception) matters, not a return value.
        judge_task.result()
        rebidding_situation = rebid_task.result()

    answers.append(add_outer_key(rebidding_situation, "重新招标、不再招标和终止招标"))
    return answers

def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invalid_path):
    """Build the aggregated "基础信息" section for a goods tender.

    Runs the classic basic-info extraction and the procurement-requirement
    extraction on two threads, concatenates their results, and reorganises
    them with ``aggregate_basic_info(..., 'goods')``.

    Args:
        merged_baseinfo_path: Forwarded to ``get_base_info``.
        procurement_path: Forwarded to ``fetch_procurement_reqs``.
        clause_path: Forwarded to ``get_base_info``.
        invalid_path: Forwarded to both extraction steps.

    Returns:
        A dict with the single key "基础信息" holding the aggregated structure.

    NOTE: an exception raised inside either worker thread is not re-raised
    here; that worker's slot simply keeps its initial empty value.
    """
    # Shared slots the workers fill in; defaults are used if a worker fails.
    collected = {"base": [], "procurement": {}}

    def _collect_base_info():
        # Classic basic-info extraction.
        collected["base"] = get_base_info(merged_baseinfo_path,clause_path,invalid_path)

    def _collect_procurement_reqs():
        # Procurement-requirement extraction.
        collected["procurement"] = fetch_procurement_reqs(procurement_path,invalid_path)

    base_worker = threading.Thread(target=_collect_base_info)
    base_worker.start()
    # The procurement worker is started one second after the first one
    # (presumably to stagger the remote requests - TODO confirm).
    time.sleep(1)
    procurement_worker = threading.Thread(target=_collect_procurement_reqs)
    procurement_worker.start()

    for worker in (base_worker, procurement_worker):
        worker.join()

    # Base-info entries (a list) first, then the procurement dict as one item.
    combined = [*collected["base"]]
    combined.append(collected["procurement"])
    aggregated_baseinfo = aggregate_basic_info(combined,'goods')

    return {"基础信息": aggregated_baseinfo}

if __name__ == "__main__":
    # Manual smoke run against a locally extracted sample tender; the paths
    # below point at one developer machine's output directory.
    # perf_counter() is a monotonic, high-resolution clock, so the reported
    # duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile_merged_baseinfo.pdf"
    clause_path = r'D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\clause1.json'
    invalid_path = r"D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile.docx"
    procurement_path = r'D:\flask_project\flask_app\static\output\output1\ecd3374a-a19e-475e-a4dd-e803a3bc1fbf\ztbfile_procurement.pdf'
    res = combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_path)
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.perf_counter()
    print("elapsed time:" + str(end_time - start_time))