10.25切分改为多线程

2024-10-25 17:50:20 +08:00 · 2024-10-25 17:50:20 +08:00 · e73125f097
commit e73125f097
parent 6ecc370d27
7 changed files with 115 additions and 41 deletions
--- a/flask_app/general/format_date.py
+++ b/flask_app/general/format_date.py
@ -13,6 +13,11 @@ def format_chinese_date(date_str):
        str: 格式化后的日期字符串，例如 "2019-07-18 09:30:00"
        如果格式错误，返回 None 并打印错误信息。
    """
+    # 检查输入类型
+    if not isinstance(date_str, str):
+        print(f"输入类型错误: 期望字符串类型，实际类型为 {type(date_str)}")
+        return None
+
    # print("------------")
    # print(f"原始输入: {date_str}")

@ -104,6 +109,7 @@ def format_chinese_date(date_str):
 if __name__ == "__main__":
    input_dates = [
        # 完整的日期和时间
+        ["www"],
        "2019年7月18日09：30",
        "20 19 年7 月18日 09： 30整（北京时间）",
        "2020年02月05日12时30分45秒",
--- a/flask_app/general/post_processing.py
+++ b/flask_app/general/post_processing.py
@ -97,18 +97,22 @@ def inner_post_processing(base_info):
            str: 转换后的字符串。
        """
        if isinstance(value, list):
-            concatenated = []
-            for item in value:
-                if isinstance(item, dict):
-                    pairs = []
-                    for k, v in item.items():
-                        if v not in ["未知", ""]:
-                            pairs.append(f"{k}:{v}")
-                    concatenated.append("  ".join(pairs))
-                else:
-                    # 如果列表中有非字典项，直接转换为字符串
-                    concatenated.append(str(item))
-            return "  ".join(concatenated)
+            if all(isinstance(item, str) for item in value):
+                # 如果列表中的所有元素都是字符串，则用两个空格连接
+                return "  ".join(item for item in value if item not in ["未知", ""])
+            else:
+                concatenated = []
+                for item in value:
+                    if isinstance(item, dict):
+                        pairs = []
+                        for k, v in item.items():
+                            if v not in ["未知", ""]:
+                                pairs.append(f"{k}:{v}")
+                        concatenated.append("  ".join(pairs))
+                    else:
+                        # 如果列表中有非字典且非字符串的项，直接转换为字符串
+                        concatenated.append(str(item))
+                return "  ".join(concatenated)
        elif isinstance(value, dict):
            pairs = []
            for k, v in value.items():
@ -201,7 +205,7 @@ def inner_post_processing(base_info):

    return extracted_info

-def outer_post_processing(combined_data, includes):
+def outer_post_processing(combined_data, includes,good_list):
    """
    外层处理函数，调用内层 post_processing 处理 '基础信息'，并构建 processed_data。
    额外提取 '采购要求' 下的 '技术要求' 内容。
@ -259,7 +263,7 @@ if __name__ == "__main__":
    combined_data={
    "基础信息": {
        "招标人/代理信息": {
-            "招标人": "广水市公路管理局",
+            "招标人": ["广水市公路管理局","sss"],
            "招标人联系方式": {
                "名称": "广水市公路管理局",
                "联系电话": "17362698785",
@ -695,9 +699,9 @@ if __name__ == "__main__":
            "其他要求": ""
        },
        "关键时间/内容": {
-            "投标文件递交截止日期": "2021年月日点分",
+            "投标文件递交截止日期": "2021年12月1日点分",
            "投标文件递交地点": "广水市公共资源交易中心五楼号开标室",
-            "开标时间": "未知",
+            "开标时间": "www",
            "开标地点": "广水市公共资源交易中心五楼号开标室",
            "澄清招标文件的截止时间": "未知",
            "投标有效期": "提交投标文件截止之日起 60日历日",
@ -952,5 +956,5 @@ if __name__ == "__main__":
    includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
    res1,res2,res3=outer_post_processing(combined_data,includes)
    # print(json.dumps(res2,ensure_ascii=False,indent=4))
-    print(json.dumps(res2,ensure_ascii=False,indent=4))
+    print(json.dumps(res3,ensure_ascii=False,indent=4))

--- a/flask_app/general/截取文件格式.py
+++ b/flask_app/general/截取文件格式.py
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@ -11,7 +11,7 @@ from flask_app.main.download import download_file
 from flask_app.general.post_processing import outer_post_processing
 from flask_app.main.工程标解析main import engineering_bid_main
 from flask_app.货物标.货物标解析main import goods_bid_main
-from flask_app.货物标.技术要求提取 import get_technical_requirements_main
+from flask_app.货物标.技术参数要求提取 import get_technical_requirements_main
 app = Flask(__name__)

 class CSTFormatter(logging.Formatter):
@ -337,6 +337,7 @@ def process_and_stream(file_url, zb_type):
        logger.info("本地文件路径: " + downloaded_filepath)

        combined_data = {}
+        good_list = None

        # 根据zb_type选择调用的处理函数
        processing_functions = {
@ -358,6 +359,11 @@ def process_and_stream(file_url, zb_type):
                logger.error(f"Data received: {data}")
                continue  # Skip data if JSON parsing fails

+            if 'good_list' in parsed_data:
+                good_list = parsed_data['good_list']
+                logger.info("Collected good_list from the processing function.")
+                continue  # Skip yielding good_list to the client
+
            # 遍历 parsed_data 只提取内层内容进行合并
            for outer_key, inner_dict in parsed_data.items():
                if isinstance(inner_dict, dict):
@ -379,7 +385,7 @@ def process_and_stream(file_url, zb_type):
        output_json_path = os.path.join(output_folder, 'final_result.json')
        extracted_info_path=os.path.join(output_folder, 'extracted_result.json')
        includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
-        final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes)
+        final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes,good_list)

        logger.info(f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}")  # 添加日志记录
        #采购需求
@ -404,7 +410,7 @@ def process_and_stream(file_url, zb_type):
        except IOError as e:
            logger.error(f"保存JSON文件时出错: {e}")

-        #截取的数据
+        #提取的数据
        extracted_info_response = {
            'message': 'extracted_info',
            'filename': os.path.basename(downloaded_filepath),
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -8,37 +8,60 @@ from flask_app.general.json_utils import clean_json_string, combine_json_results
 from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main

 def generate_key_paths(data, parent_key=''):
+    """
+    Collect the dotted key paths of a nested dict together with the innermost key names.
+
+    Args:
+    data (dict): the dictionary to walk
+    parent_key (str): dotted path of the enclosing keys; filled in by the recursive calls
+
+    Returns:
+    tuple: (key_paths, good_list, no_keys_added) -- dotted leaf paths, the leaf key names
+           in the same order, and a flag that stays True only when nothing was recorded
+    """
    key_paths = []
+    good_list = []
    no_keys_added = True  # start by assuming that no key has been recorded

    for key, value in data.items():
+        # build the dotted path for the current key
        current_key = f"{parent_key}.{key}" if parent_key else key
+
        if isinstance(value, dict):
            if value:
-                # 递归调用，并更新 no_keys_added 状态
-                sub_paths, sub_no_keys_added = generate_key_paths(value, current_key)
-                key_paths.extend(sub_paths)
+                # recurse, then merge the child's paths and good_list into ours
+                sub_key_paths, sub_good_list, sub_no_keys_added = generate_key_paths(value, current_key)
+                key_paths.extend(sub_key_paths)
+                good_list.extend(sub_good_list)
                no_keys_added = no_keys_added and sub_no_keys_added
            else:
-                # 空字典也视为未添加键
+                # an empty dict is recorded as a leaf
                key_paths.append(current_key)
+                good_list.append(key)
                no_keys_added = False
        elif isinstance(value, list):
-            # 列表只在非空时视为添加了键
            if value:
+                # a non-empty list is recorded as a leaf
                key_paths.append(current_key)
+                good_list.append(key)
                no_keys_added = False
            else:
+                # an empty list is recorded as a leaf too (adjust here if requirements change)
+                key_paths.append(current_key)
+                good_list.append(key)
                no_keys_added = False
-        elif value == "未知" or value == "" or value == "/":
-            # 处理为"未知"或空字符串
+        elif value in {"未知", "", "/"}:
+            # placeholder values ("未知", "", "/") are recorded as leaves
            key_paths.append(current_key)
+            good_list.append(key)
            no_keys_added = False
        else:
-            # 值不是字典、列表、"未知"或空字符串，也不添加到键路径
-            no_keys_added = True and no_keys_added  # 只保持 True 如果之前所有键都未添加
+            # every remaining value (not a dict, list or placeholder) is recorded as a leaf as well
+            key_paths.append(current_key)
+            good_list.append(key)
+            no_keys_added = False

-    return key_paths, no_keys_added
+    return key_paths, good_list, no_keys_added

 def combine_and_update_results(original_data, updates):
    def recursive_update(data, key, value):
@ -65,6 +88,7 @@ def postprocess(data):

    # 递归处理顶层数据
    return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
+
 def get_technical_requirements(file_id):
    user_query1 = """
    这是一份货物标中采购要求部分的内容，请告诉我需要采购的系统（或货物），如果有采购清单，请直接根据清单上的货物名称给出结果，若没有采购清单，你要从文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，如某大系统中包含若干货物，那么需要用嵌套键值对表示这种关系，请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
@ -82,9 +106,9 @@ def get_technical_requirements(file_id):
    
    """
    res = qianwen_long(file_id, user_query1)
-    cleaned_res = clean_json_string(res)
-    print(res)
-    keys_list ,no_keys_added= generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
+    cleaned_res = clean_json_string(res)     #转字典
+    # print(res)
+    keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
    if '采购需求' in cleaned_res:
        cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
    if no_keys_added:
@ -111,6 +135,7 @@ def get_technical_requirements(file_id):
        # 更新原始采购需求字典
        combine_and_update_results(cleaned_res['技术要求'], technical_requirements_combined_res)
        final_res = postprocess(cleaned_res)
+        final_res['技术要求']["货物列表"] = good_list     #添加需要采购的货物
    # 输出最终的 JSON 字符串
    return final_res

@ -148,11 +173,12 @@ def get_technical_requirements_main(file_path,output_folder):
    else:
        return final_res
 if __name__ == "__main__":
-    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_procurement.pdf"
-    truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\ztbfile.pdf"
-    output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\tmp"
+    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_procurement.pdf"
+    # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\ztbfile.pdf"
+    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
    file_id = upload_file(truncate_file)
-    res=get_technical_requirements_main(truncate_file,output_folder)
+    res=get_technical_requirements(file_id)
+    # res=get_technical_requirements_main(truncate_file,output_folder)
    json_string = json.dumps(res, ensure_ascii=False, indent=4)
    print(json_string)
    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@ -1,7 +1,7 @@
 import concurrent.futures
 import json
 import time
-from flask_app.货物标.技术要求提取 import get_technical_requirements
+from flask_app.货物标.技术参数要求提取 import get_technical_requirements
 from flask_app.general.通义千问long import upload_file
 from flask_app.货物标.商务服务其他要求提取 import get_business_requirements

@ -38,7 +38,8 @@ def fetch_procurement_reqs(truncate_file):
            "技术要求": technical_requirements.get("技术要求", {}),
            "商务要求": business_requirements.get("商务要求", {}),
            "服务要求": business_requirements.get("服务要求", {}),
-            "其他要求": business_requirements.get("其他要求", {})
+            "其他要求": business_requirements.get("其他要求", {}),
+            "货物列表":business_requirements.get("货物列表",{})
        }

        return procurement_reqs
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -81,8 +81,9 @@ def preprocess_files(output_folder, file_path, file_type):
 def fetch_project_basic_info(merged_baseinfo_path, procurement_file_path):  # Instructions-to-Bidders preliminary data sheet
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(merged_baseinfo_path, procurement_file_path)
+    base_info,good_list=post_process_baseinfo(basic_res)
    logger.info("基础信息done")
-    return basic_res
+    return base_info,good_list  # now a (base_info, good_list) 2-tuple instead of the bare dict; callers must unpack it


 def fetch_qualification_review(output_folder, qualification_path, notice_path,merged_baseinfo_path):  # 资格审查
@ -130,6 +131,30 @@ def fetch_bid_opening(clause_path):
    return {"开评定标流程": fetch_bid_opening_json}


+def post_process_baseinfo(base_info):
+    """
+    Post-processing step run once the 'base_info' task has finished.
+    Pops '货物列表' out of base_info['采购要求']['技术要求']; good_list is [] when any key is missing.
+
+    Args:
+    - base_info (dict): the raw base_info data; modified in place when '货物列表' is present.
+
+    Returns:
+    - tuple: (base_info with '货物列表' removed, good_list)
+    """
+    try:
+        # walk down to '货物列表'; every missing level falls back to an empty default
+        print(json.dumps(base_info,ensure_ascii=False,indent=4))
+        procurement_reqs = base_info.get('采购要求', {})
+        technical_requirements = procurement_reqs.get('技术要求', {})
+        good_list = technical_requirements.pop('货物列表', [])  # [] when '货物列表' is absent
+
+        logger.info(f"Extracted good_list: {good_list}")
+        return base_info, good_list
+    except Exception as e:
+        logger.error(f"Error in post_process_baseinfo: {e}")
+        return base_info, []  # any failure (e.g. a level that is not a dict) yields an empty list
+
 def goods_bid_main(output_folder, file_path, file_type, unique_id):
    global logger
    logger = get_global_logger(unique_id)
@ -162,6 +187,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
            key = next(k for k, v in futures.items() if v == future)
            try:
                result = future.result()
+                if key == 'base_info':
+                    base_info, good_list = result
+                    collected_good_list = good_list  # Store good_list for later use
+                    yield json.dumps({'base_info': transform_json_values(base_info)}, ensure_ascii=False)
                # 如果是 evaluation_standards，拆分技术标和商务标
                if key == 'evaluation_standards':
                    technical_standards = result["technical_standards"]
@ -177,10 +206,12 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
+        if collected_good_list is not None:
+            yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)

-#TODO:目前的无效标这块的键值都删去空格了，所有的键名都删去空格
 #广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
 #TODO:区分output目录    陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题    #一包二包问题 107国道     #日期格式统一
+#good_list 金额  截取上下文
 if __name__ == "__main__":
    import time