# zbparse/flask_app/general/纯技术参数要求提取.py

import json
import logging
import re
import string

from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements

def get_global_logger(unique_id):
    """Return the logger named ``unique_id``, or the root logger when it is ``None``."""
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)


# Module-level logger handle; get_technical_requirements_main rebinds it on every call.
logger = None
def extract_matching_keys(data_dict, good_list):
    """
    Recursively collect the entries of ``data_dict`` whose key names a good in ``good_list``.

    A key matches when it equals an entry of ``good_list`` exactly, or equals an
    entry followed by ``-<digits>`` (several devices of the same kind inside one
    system, e.g. ``switch-1``). Matched values are still descended into, so
    matches nested inside a matched value are collected as well.

    When the same matching key occurs more than once anywhere in the structure
    (e.g. the same device listed under different systems), every occurrence
    gets a letter suffix ``-a``, ``-b``, ... ``-z``, ``-aa``, ... in traversal
    order to keep the entries distinct. Keys that occur once are kept as-is.

    Args:
        data_dict (dict | list): Nested dicts/lists to walk. Any other type
            yields an empty result.
        good_list (list | None): Names to look for. ``None`` is treated as
            empty and non-string entries are ignored.

    Returns:
        dict: Matched keys (suffixed where duplicated) mapped to their original
        values, in traversal order.
    """
    # Tolerate a missing/null goods list and stray non-string entries instead of
    # failing inside re.escape.
    names = [name for name in (good_list or []) if isinstance(name, str)]
    if not names:
        return {}

    # One anchored alternation replaces a list of per-name patterns; the
    # optional "-<digits>" tail covers numbered devices within one system.
    matcher = re.compile(
        r'^(?:' + '|'.join(re.escape(name) for name in names) + r')(?:-\d+)?$'
    )

    def letter_suffix(count):
        """Map 1 -> '-a', 2 -> '-b', ..., 26 -> '-z', 27 -> '-aa', ..."""
        letters = string.ascii_lowercase
        suffix = ''
        while count > 0:
            count, remainder = divmod(count - 1, 26)
            suffix = letters[remainder] + suffix
        return '-' + suffix

    hits = []  # (key, value) pairs in traversal order

    def walk(node):
        if isinstance(node, dict):
            for key, value in node.items():
                # Only string keys can name a good; other key types would make
                # the regex match raise TypeError.
                if isinstance(key, str) and matcher.match(key):
                    hits.append((key, value))
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)
        # Scalars and other types carry no keys, so there is nothing to do.

    walk(data_dict)

    # Total occurrences per key decide whether a suffix is needed at all.
    totals = {}
    for key, _ in hits:
        totals[key] = totals.get(key, 0) + 1

    result = {}
    seen = {}  # running occurrence index per duplicated key
    for key, value in hits:
        if totals[key] > 1:
            seen[key] = seen.get(key, 0) + 1
            result[f"{key}{letter_suffix(seen[key])}"] = value
        else:
            result[key] = value
    return result

def get_technical_requirements_main(file_path,file_type,unique_id,output_folder):
    """
    Extract the technical requirements of the listed goods from one tender document.

    Steps: obtain a PDF of the input, ask ``truncate_pdf_main`` for a truncated
    PDF (falling back to the whole PDF when none is produced), convert that to
    docx, upload it, query ``get_technical_requirements`` and flatten the
    answer with ``extract_matching_keys``.

    Side effect: rebinds the module-level ``logger`` to the logger selected by
    ``unique_id``.

    Args:
        file_path (str): Path of the tender document.
        file_type (int): 1 = docx, 2 = pdf, 3 = doc.
        unique_id: Logger name; ``None`` selects the root logger.
        output_folder (str): Folder passed through to ``truncate_pdf_main``.

    Returns:
        ``None`` for an unsupported ``file_type``. Otherwise, when the model
        answer is a dict holding a dict under '技术要求', the flattened
        ``{goods key: requirements}`` mapping; in every other case the raw
        answer of ``get_technical_requirements`` unchanged.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type == 2:  # pdf
        pdf_path = file_path
    elif file_type in (1, 3):  # docx / doc: both are converted to pdf for later processing
        pdf_path = docx2pdf(file_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Selection 5 is presumably the technical-requirements section -- confirm
    # against truncate_pdf_main.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    # Indexing [0] unconditionally raised on an empty/None result and skipped
    # the whole-file fallback below.
    truncate_file = truncated[0] if truncated else None
    if not truncate_file:
        truncate_file = pdf_path  # send the whole document instead

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unwrap '技术要求' when it is really a dict; otherwise hand back the raw answer.
    technical_requirements = final_res.get('技术要求') if isinstance(final_res, dict) else None
    if not isinstance(technical_requirements, dict):
        return final_res

    # '货物列表' may be absent or present with a null value; both mean "no goods".
    good_list = technical_requirements.pop('货物列表', None) or []
    logger.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(technical_requirements, good_list)
if __name__ == "__main__":
    # Manual run against a sample tender PDF; the paths are specific to one developer machine.
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    tmp_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
    pdf_type = 2  # 2 = pdf
    extracted = get_technical_requirements_main(sample_pdf, pdf_type, "123", tmp_folder)
    print(json.dumps(extracted, ensure_ascii=False, indent=4))
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								import json
 								import logging
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
+								import re
 								import string
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
 								from flask_app.general.format_change import pdf2docx, docx2pdf
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								from flask_app.general.通义千问long import upload_file
 								from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
 								from flask_app.货物标.技术参数要求提取 import get_technical_requirements
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								def get_global_logger(unique_id):
 								    if unique_id is None:
 								        return logging.getLogger()  # 获取默认的日志器
 								    logger = logging.getLogger(unique_id)
 								    return logger
 								logger = None
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								def extract_matching_keys(data_dict, good_list):
 								    """
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
+								    递归遍历data_dict，查找good_list中存在的键（完全匹配或以good_list中的键开头，后跟“-数字”），并将匹配的键及其值添加到结果字典中。
 								    对于重复的键名，添加后缀 -a, -b, -c 等以确保唯一性。
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
 								    参数:
 								    - data_dict (dict): 要遍历的嵌套字典。
 								    - good_list (list): 包含要查找的键的列表。
 								    返回:
 								    - dict: 包含所有匹配键及其值的字典。
 								    """
 								    result = {}
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
+								    key_count = {}  # 用于统计每个匹配键的出现次数
 								    # 预编译正则模式以提高效率
 								    patterns = [re.compile(r'^' + re.escape(g) + r'(?:-\d+)?$') for g in good_list]           #考虑同一系统下有同门设备，如交换机
 								    def matches(key):
 								        return any(pattern.match(key) for pattern in patterns)
 								    # 第一次遍历：统计每个匹配键的出现次数
 								    def first_pass(current_dict):
 								        if isinstance(current_dict, dict):
 								            for key, value in current_dict.items():
 								                if matches(key):
 								                    key_count[key] = key_count.get(key, 0) + 1
 								                first_pass(value)
 								        elif isinstance(current_dict, list):
 								            for item in current_dict:
 								                first_pass(item)
 								    first_pass(data_dict)
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
+								    # 初始化用于跟踪每个重复键当前使用的后缀编号
 								    suffix_map = {key: 0 for key, count in key_count.items() if count > 1}
 								    def get_suffix(count):
 								        """
 								        根据计数获取字母后缀，例如：         考虑不同系统下都有同门设备，如交换机
 -> '-a'
 -> '-b'
 								        ...
 -> '-z'
 -> '-aa'
 								        """
 								        letters = string.ascii_lowercase
 								        suffix = ''
 								        while count > 0:
 								            count, remainder = divmod(count - 1, 26)
 								            suffix = letters[remainder] + suffix
 								        return '-' + suffix
 								    # 第二次遍历：添加后缀并构建结果字典
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								    def recurse(current_dict):
 								        if isinstance(current_dict, dict):
 								            for key, value in current_dict.items():
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
+								                if matches(key):
 								                    if key_count.get(key, 0) > 1:
 								                        suffix_map[key] += 1
 								                        suffix = get_suffix(suffix_map[key])
 								                        new_key = f"{key}{suffix}"
 								                    else:
 								                        new_key = key
 								                    result[new_key] = value
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								                recurse(value)
 								        elif isinstance(current_dict, list):
 								            for item in current_dict:
 								                recurse(item)
 								        # 如果current_dict不是dict或list，则无需进一步处理
-.17 修复了提取货物标的Bug

											
										
										
											2024-11-17 16:29:02 +08:00
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								    recurse(data_dict)
 								    return result
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								def get_technical_requirements_main(file_path,file_type,unique_id,output_folder):
 								    global logger
 								    logger = get_global_logger(unique_id)
 								    if file_type == 1:  # docx
 								        docx_path = file_path
 								        pdf_path = docx2pdf(docx_path)  # 将docx转换为pdf以供后续处理
 								    elif file_type == 2:  # pdf
 								        pdf_path = file_path
 								    elif file_type == 3:  # doc
 								        pdf_path = docx2pdf(file_path)
 								    else:
 								        logger.error("Unsupported file type provided. Preprocessing halted.")
 								        return None
 								    truncate_file=truncate_pdf_main(pdf_path,output_folder,5)[0]
-.2 技术参数提取解决究极bug，增加了多线程提问中的超时重问处理

											
										
										
											2024-11-02 14:29:28 +08:00
+								    if not truncate_file:
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								        truncate_file=pdf_path    #直接传整份文件
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
+								    truncate_file_docx=pdf2docx(truncate_file)
 								    file_id=upload_file(truncate_file_docx)
-.29截取工程标新逻辑

											
										
										
											2024-10-30 09:23:59 +08:00
+								    # file_id=upload_file(truncate_file)
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								    final_res=get_technical_requirements(file_id,pdf_path)
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								    # 安全地提取 "技术要求" 内部的字典内容
 								    if isinstance(final_res, dict) and '技术要求' in final_res and isinstance(final_res['技术要求'], dict):
 								        technical_requirements = final_res['技术要求']
 								        good_list = technical_requirements.pop('货物列表', [])  # 如果 '货物列表' 不存在，返回 []
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								        print(good_list)
 								        logger.info("Collected good_list from the processing function: %s", good_list)
-.25切分改为多线程

											
										
										
											2024-10-27 12:08:54 +08:00
+								        return extract_matching_keys(technical_requirements,good_list)
 								    else:
 								        return final_res
 								if __name__ == "__main__":
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
 								    file_type=2
-.29截取工程标新逻辑

											
										
										
											2024-10-29 20:40:14 +08:00
+								    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
-.6修复bug

											
										
										
											2024-11-06 12:20:24 +08:00
+								    res=get_technical_requirements_main(file_path,file_type,"123",output_folder)
 								    print(json.dumps(res,ensure_ascii=False,indent=4))