2025-03-03 17:28:33 +08:00

235 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# flask_app/routes/upload.py
import multiprocessing
import urllib.parse
from flask import Blueprint, request, jsonify,g
import json
import os
import time
from flask_app.general.format_change import download_file
from flask_app.routes.工程标解析main import engineering_bid_main
from flask_app.routes.货物标解析main import goods_bid_main
from flask_app.general.post_processing import outer_post_processing
from flask_app.routes.utils import generate_deviation_response, validate_and_setup_logger, create_response, sse_format, \
log_error_unique_id
from flask_app.ConnectionLimiter import require_connection_limit, require_execution_timeout
upload_bp = Blueprint('upload', __name__)
def _child_target(main_func, queue, output_folder, file_path, file_type, unique_id):
"""
子进程中调用 `main_func`(它是一个生成器函数),
将其 yield 出的数据逐条放进队列,最后放一个 None 表示结束。
"""
try:
for data in main_func(output_folder, file_path, file_type, unique_id):
queue.put(data)
except Exception as e:
# 如果要把异常也传给父进程,以便父进程可感知
queue.put(json.dumps({'error': str(e)}, ensure_ascii=False))
finally:
queue.put(None)
def run_in_subprocess(main_func, output_folder, file_path, file_type, unique_id):
    """
    Spawn a child process that runs ``main_func(...)`` and stream its output
    back to the caller through a multiprocessing.Queue.

    When the child exits the OS reclaims its memory, while the parent keeps
    yielding items in real time as they arrive.
    """
    out_queue = multiprocessing.Queue()
    worker = multiprocessing.Process(
        target=_child_target,
        args=(main_func, out_queue, output_folder, file_path, file_type, unique_id)
    )
    worker.start()
    # Drain the queue until the None sentinel emitted by _child_target.
    while (item := out_queue.get()) is not None:
        yield item
    worker.join()
@upload_bp.route('/upload', methods=['POST'])
@validate_and_setup_logger
# @require_connection_limit(timeout=1800)
@require_execution_timeout(timeout=1800)
def zbparse():  # full parse entry point
    """
    Main parsing endpoint.

    Reads ``file_url`` / ``zb_type`` prepared on ``g`` by the validation
    decorator, then delegates to :func:`process_and_stream`, which streams
    SSE responses back to the caller.  Returns a JSON error response on
    failure.
    """
    logger = g.logger
    file_name = None  # known only after file_url is read; used by error logging below
    try:
        logger.info("大解析开始!!!")
        received_data = request.get_json()
        logger.info("Received JSON data: " + str(received_data))
        # BUG FIX: file_url must be read from g BEFORE deriving file_name.
        # The original referenced file_url one line too early, raising
        # NameError on every request.
        file_url = g.file_url
        zb_type = g.zb_type
        file_name = urllib.parse.unquote(file_url).split('/')[-1]
        logger.info(f"Starting parsing file: {file_name}")
        try:
            logger.info("starting parsing url:" + file_url)
            return process_and_stream(file_url, zb_type)  # main execution path
        except Exception as e:
            logger.error('Exception occurred: ' + str(e))
            if hasattr(g, 'unique_id'):
                log_error_unique_id(g.unique_id, 1, file_name=file_name)
            error_response = create_response(
                message='处理文件时发生异常',
                status='error',
                data=str(e)
            )
            return jsonify(error_response)
    except Exception as e:
        logger.error('Unexpected exception: ' + str(e))
        if hasattr(g, 'unique_id'):
            log_error_unique_id(g.unique_id, 1, file_name=file_name)
        error_response = create_response(
            message='内部服务器错误',
            status='error',
            data='Internal server error'
        )
        return jsonify(error_response)
def process_and_stream(file_url, zb_type):
    """
    Download the tender document at ``file_url`` and stream parsing results
    back to the caller as SSE-formatted responses.

    ``zb_type`` selects the pipeline: 1 = engineering bid, anything else
    falls back to the goods/service-bid parser.  The generator ends with a
    ``data='END'`` message, which the backend uses to close the connection.
    """
    logger = g.logger
    unique_id = g.unique_id
    output_folder = g.output_folder
    filename = "ztbfile"
    downloaded_filename = os.path.join(output_folder, filename)
    start_time = time.time()
    file_name = urllib.parse.unquote(file_url).split('/')[-1]
    try:
        downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
        # file_type == 4 marks an unsupported format — presumably part of
        # download_file's contract; TODO confirm against its definition.
        if not downloaded_filepath or file_type == 4:
            logger.error("下载文件失败或不支持的文件类型")
            log_error_unique_id(unique_id, 1, file_name=file_name)
            error_response = create_response(
                message='下载文件失败或不支持的文件类型',
                status='error',
                data=''
            )
            yield sse_format(error_response)
            return
        logger.info("本地文件路径: " + downloaded_filepath)
        combined_data = {}
        good_list = None
        processing_functions = {
            1: engineering_bid_main,  # engineering-bid parsing
            2: goods_bid_main  # goods-bid / service-bid parsing
        }
        processing_func = processing_functions.get(zb_type, goods_bid_main)
        # Receive parsed chunks one by one from the subprocess, for live
        # display on the frontend.
        for data in run_in_subprocess(processing_func, output_folder, downloaded_filepath, file_type, unique_id):
            if not data.strip():
                logger.error("Received empty data, skipping JSON parsing.")
                continue
            try:
                parsed_data = json.loads(data)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to decode JSON: {e}")
                logger.error(f"Data received: {data}")
                continue
            if 'error' in parsed_data:
                error_message = parsed_data['error']
                logger.error(f"Processing terminated due to error: {error_message}")
                # Return the error in the standard response format.
                error_response = create_response(
                    message=error_message,
                    status='error',
                    data=''
                )
                yield sse_format(error_response)
                return  # stop any further processing
            if 'good_list' in parsed_data:  # goods list
                good_list = parsed_data['good_list']
                logger.info("Collected good_list from the processing function: %s", good_list)
                continue
            # Flatten one level: merge each inner dict into combined_data
            # for the post-processing stage.
            for outer_key, inner_dict in parsed_data.items():
                if isinstance(inner_dict, dict):
                    combined_data.update(inner_dict)
            response = create_response(
                message='Processing',
                status='success',
                data=data
            )
            yield sse_format(response)  # streamed to the backend -> frontend display
        base_end_time = time.time()
        logger.info(f"分段解析完成,耗时:{base_end_time - start_time:.2f}")
        # The frontend has now received everything it displays.  What
        # follows is backend post-processing only:
        #   1. persist extracted_result key info
        #   2. technical deviation table
        #   3. business deviation table
        #   4. bidder proof materials (stored backend-side, not yet shown
        #      on the frontend)
        # Post-processing starts here!
        output_json_path = os.path.join(output_folder, 'final_result.json')
        extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
        includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
        final_result, extracted_info, tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation, proof_materials = outer_post_processing(
            combined_data, includes, good_list)  # build extracted_info, business/technical deviation data, and proof materials
        # Post-processing done!  Everything below only builds responses;
        # no further data mutation.
        tech_deviation_response, tech_deviation_star_response, zigefuhe_deviation_response, shangwu_deviation_response, shangwu_star_deviation_response, proof_materials_response = generate_deviation_response(
            tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,
            proof_materials, logger)  # normalize into standard responses
        # Emit each deviation table via the shared response helper.
        yield sse_format(tech_deviation_response)
        yield sse_format(tech_deviation_star_response)
        yield sse_format(zigefuhe_deviation_response)
        yield sse_format(shangwu_deviation_response)
        yield sse_format(shangwu_star_deviation_response)
        yield sse_format(proof_materials_response)
        try:
            with open(extracted_info_path, 'w', encoding='utf-8') as json_file:
                json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
            logger.info(f"摘取后的数据已保存到 '{extracted_info_path}'")
        except IOError as e:
            logger.error(f"保存JSON文件时出错: {e}")
            log_error_unique_id(unique_id,1,file_name=file_name)  # record the failed unique_id
        try:
            with open(output_json_path, 'w', encoding='utf-8') as json_file:
                json.dump(final_result, json_file, ensure_ascii=False, indent=4)
            logger.info(f"合并后的数据已保存到 '{output_json_path}'")
        except IOError as e:
            logger.error(f"保存JSON文件时出错: {e}")
            log_error_unique_id(unique_id,1,file_name=file_name)  # record the failed unique_id
        extracted_info_response = create_response(
            message='extracted_info',
            status='success',
            data=json.dumps(extracted_info, ensure_ascii=False)
        )
        yield sse_format(extracted_info_response)
        complete_response = create_response(
            message='Combined_data',
            status='success',
            data=json.dumps(final_result, ensure_ascii=False)
        )
        yield sse_format(complete_response)
        final_response = create_response(  # backend closes the connection once it sees 'END' in 'data'
            message='文件上传并处理成功',
            status='success',
            data='END'
        )
        yield sse_format(final_response)
    except Exception as e:
        logger.error(f"Unexpected error in process_and_stream: {e}")
        log_error_unique_id(unique_id,1,file_name=file_name)  # record the failed unique_id
        error_response = create_response(
            message='内部服务器错误',
            status='error',
            data=''
        )
        yield sse_format(error_response)
    finally:
        end_time = time.time()
        duration = end_time - start_time
        logger.info(f"Total processing time: {duration:.2f} seconds")