# zbparse/flask_app/main/招标文件解析.py

# -*- encoding:utf-8 -*-
import json
import logging
import os
import time
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import nest_json_under_key, transform_json_values, combine_json_results
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
import concurrent.futures
from flask_app.main.基础信息整合 import project_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.main.format_change import pdf2docx, docx2pdf
from flask_app.main.docx截取docx import copy_docx

# Module-level logger shared by the helpers in this file. It starts as None;
# main_processing() assigns it from get_global_logger(unique_id), and the
# fetch_* functions log through it.
global_logger = None


def get_global_logger(unique_id):
    """Look up the logger that belongs to ``unique_id``.

    Args:
        unique_id: Name of the logger to fetch. ``None`` selects the default
            logger returned by ``logging.getLogger()``.

    Returns:
        logging.Logger: The logger obtained from the ``logging`` module.
    """
    if unique_id is not None:
        return logging.getLogger(unique_id)
    # No id supplied: fall back to the default logger.
    return logging.getLogger()


# Known risk: converting pdf to docx may make check-mark symbols disappear.
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
    """Convert, upload and slice the tender document ahead of the analysis tasks.

    The input is converted so that both a docx and a pdf version exist, the
    docx is added to a knowledge base, the pdf is cut into sections, and the
    intermediate json/docx artifacts used by the later steps are produced.

    Args:
        output_folder: Directory handed to the truncation/extraction helpers.
        downloaded_file_path: Path of the tender document to process.
        file_type: ``1`` when the input is a docx, ``2`` when it is a pdf.
        unique_id: String appended to ``"招标解析"`` to build the
            knowledge-base name.

    Returns:
        dict: Keys ``input_file_path``, ``output_folder``, ``truncate0``,
        ``truncate1``, ``truncate3``, ``knowledge_index``, ``knowledge_name``,
        ``truncate0_jsonpath``, ``clause_path`` and ``invalid_docpath``.

    Raises:
        ValueError: If ``file_type`` is neither ``1`` nor ``2``.
    """
    # Keep both paths local. As module globals they leaked between calls, so an
    # unsupported file_type could silently reuse the files of an earlier call.
    if file_type == 1:  # docx
        docx_path = downloaded_file_path
        pdf_path = docx2pdf(docx_path)  # docx -> pdf for the later processing steps
    elif file_type == 2:  # pdf
        pdf_path = downloaded_file_path
        docx_path = pdf2docx(pdf_path)  # pdf -> docx for the knowledge-base upload
    else:
        raise ValueError(
            f"Unsupported file_type: {file_type!r} (expected 1 for docx or 2 for pdf)"
        )

    # Upload to the knowledge base
    knowledge_name = "招标解析" + unique_id
    index = addfileToKnowledge(docx_path, knowledge_name)

    # Cut the pdf into sections:
    # [bidder-notice front table, evaluation method, notice body, qualification review conditions]
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder)
    print(truncate_files)

    # Process the individual sections
    truncate0_docpath = pdf2docx(truncate_files[0])  # bidder-notice front table -> docx

    invalid_docpath = copy_docx(docx_path)  # docx excerpt used for the invalid-bid analysis

    # bidder-notice front table docx -> json (data extracted from tables)
    truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder)
    truncate0 = truncate_files[0]  # bidder-notice front table
    truncate1 = truncate_files[1]  # evaluation-method front table
    truncate3 = truncate_files[3]  # qualification review table
    clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # notice body clauses pdf -> json

    return {
        'input_file_path': downloaded_file_path,
        'output_folder': output_folder,
        'truncate0': truncate0,
        'truncate1': truncate1,
        'truncate3': truncate3,
        'knowledge_index': index,
        'knowledge_name': knowledge_name,
        'truncate0_jsonpath': truncate_jsonpath,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath
    }


# Basic project information
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # bidder-notice front table
    """Run ``project_basic_info`` and log its start and completion.

    Args:
        knowledge_name: Knowledge-base name forwarded to ``project_basic_info``.
        truncate0: Path of the bidder-notice front-table section.
        output_folder: Output directory forwarded to ``project_basic_info``.
        clause_path: Path of the clause json forwarded to ``project_basic_info``.

    Returns:
        The value returned by ``project_basic_info``.
    """
    global_logger.info("starting基础信息...")
    info = project_basic_info(
        knowledge_name,
        truncate0,
        output_folder,
        clause_path,
    )
    global_logger.info("基础信息done")
    return info


# Formal, responsiveness and qualification review
def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder):
    """Run ``combine_review_standards`` and log its start and completion.

    All seven arguments are forwarded unchanged, in the same order, to
    ``combine_review_standards``.

    Returns:
        The value returned by ``combine_review_standards``.
    """
    global_logger.info("starting资格审查...")
    forwarded_args = (
        truncate1,
        truncate3,
        knowledge_name,
        truncate0_jsonpath,
        clause_path,
        input_file,
        output_folder,
    )
    standards = combine_review_standards(*forwarded_args)
    global_logger.info("资格审查done")
    return standards


# Scoring rules
def fetch_evaluation_standards(truncate1):  # evaluation-method front table
    """Run ``combine_evaluation_standards`` and log its start and completion.

    Args:
        truncate1: Path of the evaluation-method front-table section.

    Returns:
        The value returned by ``combine_evaluation_standards``.
    """
    global_logger.info("starting商务标技术标...")
    scoring = combine_evaluation_standards(truncate1)
    global_logger.info("商务标技术标done")
    return scoring


# Parsing of invalid-bid and rejected-bid items
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    """Run ``combine_find_invalid`` and log its start and completion.

    All five arguments are forwarded unchanged, in the same order, to
    ``combine_find_invalid``.

    Returns:
        The value returned by ``combine_find_invalid``.
    """
    # Rejected-bid requirements: Qianwen (translated from the original note)
    global_logger.info("starting无效标与废标...")
    invalid_items = combine_find_invalid(
        invalid_docpath,
        output_folder,
        truncate0_jsonpath,
        clause_path,
        truncate3,
    )
    global_logger.info("无效标与废标done...")
    return invalid_items


# Bid document requirements
def fetch_bidding_documents_requirements(clause_path):
    """Extract selection ``1`` from the clause json and nest it under "投标文件要求".

    Args:
        clause_path: Path of the clause json passed to ``extract_from_notice``.

    Returns:
        The value returned by ``nest_json_under_key`` for the extracted data.
    """
    global_logger.info("starting投标文件要求...")
    extracted = extract_from_notice(clause_path, 1)
    nested = nest_json_under_key(extracted, "投标文件要求")
    global_logger.info("投标文件要求done...")
    return nested


# Bid opening, evaluation and award process
def fetch_bid_opening(clause_path):
    """Extract selection ``2`` from the clause json and nest it under "开评定标流程".

    Args:
        clause_path: Path of the clause json passed to ``extract_from_notice``.

    Returns:
        The value returned by ``nest_json_under_key`` for the extracted data.
    """
    global_logger.info("starting开评定标流程...")
    extracted = extract_from_notice(clause_path, 2)
    nested = nest_json_under_key(extracted, "开评定标流程")
    global_logger.info("开评定标流程done...")
    return nested


def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx  file_type=2->pdf
    """Parse a tender document and write the merged result to ``final_result.json``.

    The file is preprocessed once, then six extraction tasks are submitted to
    a thread pool. Results are collected in a fixed order; a task that raises
    is logged and skipped so the remaining sections are still written.

    Args:
        output_folder: Directory for intermediate files and the final json.
        downloaded_file_path: Path of the tender document to parse.
        file_type: ``1`` for a docx input, ``2`` for a pdf input.
        unique_id: Used as the logger name and forwarded to
            ``preprocess_files``.

    Returns:
        str: Path of ``final_result.json`` inside ``output_folder``.
    """
    global global_logger
    global_logger = get_global_logger(unique_id)
    # Preprocess files and get necessary data paths and knowledge index
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)

    # Everything after preprocessing runs under try/finally so the knowledge
    # base created during preprocessing is deleted even if a later step raises.
    try:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Submit all tasks to the executor
            futures = {
                'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'],
                                             processed_data['truncate0'], output_folder,
                                             processed_data['clause_path']),
                'review_standards': executor.submit(fetch_review_standards, processed_data['truncate1'],
                                                    processed_data['truncate3'],
                                                    processed_data['knowledge_name'],
                                                    processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'],
                                                    processed_data['input_file_path'],
                                                    processed_data['output_folder']),
                'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
                'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                        output_folder, processed_data['truncate0_jsonpath'],
                                                        processed_data['clause_path'], processed_data['truncate3']),
                'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
                                                                  processed_data['clause_path']),
                'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
            }

            comprehensive_responses = []
            # Collect results in the defined order
            for key in ['base_info', 'review_standards', 'evaluation_standards', 'invalid_requirements',
                        'bidding_documents_requirements', 'opening_bid']:
                try:
                    # Wait for the future to complete and get the result
                    comprehensive_responses.append(futures[key].result())
                except Exception as exc:
                    # Best effort: keep the other sections, but record the
                    # failure at ERROR level together with its traceback.
                    global_logger.error(f"Error processing {key}: {exc}", exc_info=True)

        # Merge the JSON results
        combined_final_result = combine_json_results(comprehensive_responses)
        modified_json = transform_json_values(combined_final_result)

        final_result_path = os.path.join(output_folder, "final_result.json")
        with open(final_result_path, 'w', encoding='utf-8') as file:
            json.dump(modified_json, file, ensure_ascii=False, indent=2)
        # Logged only after the file has been closed.
        global_logger.info("final_result.json has been saved")
        return final_result_path
    finally:
        deleteKnowledge(processed_data['knowledge_index'])


# Current return value (sample of the "opening_bid" entry; the text below is cut off):
#   "opening_bid": "{<br>    \"开评定标流程\": {<br>        \"开标\": {<br>            \"开标时间和地点\": [<br>                \"招标人在本章第4.2.1项规定的投标截止时间（开标时间）在“电子交易平台”上公开进行开标，所有投标人均应当准时在线参加开标。\",<br>                \"招标人通过互联网在投标人须知前附表规定的地点组织开标，并在投标截止时间30分钟前，使用CA数字证书登录“电子交易平台”，进入“开标室”选择相应标段作在线开标的准备工作。\",<br>                \"投标人应当在能够保证设施设备可靠、互联网畅通的任意地点，通过互联网在线参加开标。在投标截止时间前，使用加密其投标文件的CA数字证书登录“电子交易平台”，进入“开标室”选择所投标段进行签到，并实时在线关注招标人的操作情况。5.2开标程序\",<br>                \"主持人按下列程序在“电子交易平台”的“开标室”进行在线开标：（1）宣布开标纪律；（2）公布主持人、招标人代表、监标人等有关人员姓名；（3）公布在投标截止时间前投标文件的递交情况；（4）公布投标保证金递交情况；（5）按照投标人须知前附表规定抽取评标基准价下浮值（如有）；规定最高投标限价计算方法的，计算并公布最高投标限价（如适用），当众公布后记录在案；（6）读取已解密的投标文件的内容；（7）公布投标人名称、标段名称、投标保证金的递交情况、投标报价、项目经理姓名及其他内容，并生成开标记录；（8）开标结束。\",<br>                \"在本章第5.2.1（6）目规定的时间内，非因“电子交易平台”原因造成投标文件未解密的，视为投标人撤回投标文件。已解密的投标文件少于三个的，招标失败；已解密的投标文件不少于三个，开标继续进行。\"<br>            ],<br>            \"开标异议\": [<br>                \"投标人对开标有异议的，应当在开标过程中提出；招标人当场对异议作出答复，并记入开标记录。异议与答复应通过“开标室”在“异议与答复”菜单以书面形式进行。本处所称异议是指投标人在开标过程中对投标文件提交、投标截止时间、开标程序、开标记录以及投标人和招标人或者投标人相互之间存在利益冲突的情形等提出的质疑。\",<br>                \"投标人异议成立的，招标人将及时采取纠正措施，或者提交评标委员会评审确认；投标人异议不成立的，招标人将当场给予解释说明。\"<br>            ],<br>            \"特殊情况的处置\": [<br>                \"因“电子交易平台”系统故障导致无法投标的，交易中心及时通知招标人，招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的，由投标人自行承担后果。\",<br>                \"因“电子交易平台”系统故障导致无法正常开标的，招标人将暂停开标，待系统恢复正常后继续开标。\",<br>                \"“电子交易平台”系统故障是指下列情形：（1）系统服务器发生故障，无法访问或无法使用系统；（2）系统的软件或数据库出现错误，不能进行正常操作；（3）系统发现有安全漏洞，有潜在的泄密危险；（4）出现断电、断网事故；（5）其他无法保证招投标过程正常进行的情形。\"<br>            ]<br>        },<br>        \"评标\": {<br>            \"评标委员会\": [<br>                \"评标由招标人依法组建的评标委员会负责。评标委员会由招标人代表以及有关技术、经济等方面的专家组成。评标委员会成员人数以及技术、经济等方面专家的确定方式见投标人须知前附表。\",<br>                \"评标委员会成员有下列情形之一的，应当回避：（1）
# }

# def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
#     global global_logger
#     global_logger = get_global_logger(unique_id)
#     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
#
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # 提交任务到线程池
#         futures = {
#             'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'],
#                                          processed_data['truncate_files'][0], output_folder,
#                                          processed_data['clause_path']),
#             'review_standards': executor.submit(fetch_review_standards, processed_data['truncate_files'][1],
#                                                 processed_data['truncate_files'][4],
#                                                 processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
#                                                 processed_data['clause_path']),
#             'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]),
#             'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
#                                                     output_folder, processed_data['truncate0_jsonpath'],
#                                                     processed_data['clause_path'], processed_data['truncate_files'][4]),
#             'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
#                                                               processed_data['clause_path']),
#             'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
#         }
#         # 逐个提交任务，每次提交间隔1秒
#         for task_name, func, args in futures:
#             futures[task_name] = executor.submit(func, *args)
#             time.sleep(1)  # 每提交一个任务后暂停1秒
#
#         # 根据任务完成的顺序处理和返回结果
#         for future in concurrent.futures.as_completed(futures.values()):
#             key = next(key for key, val in futures.items() if val == future)  # 获取完成任务的key
#             try:
#                 result = future.result()
#                 modified_result = transform_json_values({key: result})
#                 yield f"data: {json.dumps(modified_result)}\n\n"
#             except Exception as exc:
#                 global_logger.info(f"Error processing {key}: {exc}")
#                 yield f"data: {json.dumps({'error': f'Error processing {key}: {str(exc)}'})}\n\n"
#
#     deleteKnowledge(processed_data['knowledge_index'])

if __name__ == "__main__":
    # Manual run: time the preprocessing stage on a local sample document.
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3"
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3\\zbtest20.docx"
    file_type = 1  # 1: docx, 2: pdf, 3: other

    # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
    # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
    # clause_path = convert_clause_to_json(truncate1, output_folder)
    # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json")
    # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")

    start_time = time.time()
    preprocess_files(output_folder, input_file, file_type, "unique_id")
    elapsed_time = time.time() - start_time  # elapsed wall-clock time
    print(f"Function execution took {elapsed_time} seconds.")