9.23测试分段

2024-09-23 15:49:30 +08:00 · 2024-09-23 15:49:30 +08:00 · 60f17e32bf
commit 60f17e32bf
parent 8789bb540a
11 changed files with 230 additions and 40 deletions
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@ -261,7 +261,20 @@ def test_process_and_stream():
            'data': data
        }
        yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n"
-        time.sleep(5)  # 每隔5秒发送一段数据
+        time.sleep(3)  # 每隔3秒发送一段数据
+
+    # 在结束信号之前发送完整的数据
+    combined_data = {}
+    for segment in data_segments:
+        combined_data.update(segment)  # 将所有段落数据合并成一个大字典
+
+    # 发送完整的大字典
+    complete_response = {
+        'message': 'Combined data',
+        'filename': filename,
+        'data': combined_data
+    }
+    yield f"data: {json.dumps(complete_response, ensure_ascii=False)}\n\n"

    # 发送结束信号
    final_response = {
--- a/flask_app/main/商务标技术标整合.py
+++ b/flask_app/main/商务标技术标整合.py
@ -107,7 +107,7 @@ def combine_evaluation_standards(truncate2):
    update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
    evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4)
    return evaluation_combined_res
-
+    # return update_json              #商务标技术标整合
 if __name__ == "__main__":
    truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_evaluation_method.pdf"
    res=combine_evaluation_standards(truncate2)
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@ -110,6 +110,18 @@ def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_j


 # 评分细则
+# def fetch_evaluation_standards(truncate1):  # 评标办法前附表
+#     logger.info("starting 商务标和技术标...")
+#
+#     # 获取评标办法前附表的字典结果
+#     evaluation_standards_res = combine_evaluation_standards(truncate1)
+#     # 从结果中提取"商务标"和"技术标"
+#     technical_standards = evaluation_standards_res.get("技术标", {})
+#     commercial_standards = evaluation_standards_res.get("商务标", {})
+#     logger.info("商务标和技术标 done")
+#     # 返回技术标和商务标
+#     return technical_standards, commercial_standards
+
 def fetch_evaluation_standards(truncate1):  # 评标办法前附表
    logger.info("starting商务标技术标...")
    evaluation_standards_res = combine_evaluation_standards(truncate1)
@ -219,7 +231,9 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
 #                                                 processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
 #                                                 processed_data['clause_path'], processed_data['input_file_path'],
 #                                                 processed_data['output_folder']),
-#             'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]),
+#             # 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]),
+#             'technical_standards': executor.submit(lambda: fetch_evaluation_standards(processed_data['truncate_files'][1])[0]),  # 技术标
+#             'commercial_standards': executor.submit(lambda: fetch_evaluation_standards(processed_data['truncate_files'][1])[1]),  # 商务标
 #             'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
 #                                                     output_folder, processed_data['truncate0_jsonpath'],
 #                                                     processed_data['clause_path'], processed_data['truncate_files'][4]),
@ -252,6 +266,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
 #     # 删除知识索引
 #     deleteKnowledge(processed_data['knowledge_index'])

+#TODO:流式输出要改商务标技术标整合
 if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"

--- a/flask_app/main/资格审查模块.py
+++ b/flask_app/main/资格审查模块.py
@ -32,9 +32,9 @@ if __name__ == "__main__":
    input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\4a611a66-0dac-4daf-b365-c6167c5b8c8d\\ztbfile.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4a611a66-0dac-4daf-b365-c6167c5b8c8d"
    # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
-    truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf")
+    truncate2=os.path.join(output_folder,"ztbfile_qualification.pdf")
    knowledge_name="zbtest20"
-    truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf")
+    truncate1=os.path.join(output_folder,"ztbfile_qualification.pdf")
    truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf")
    clause_path = convert_clause_to_json(truncate2, output_folder)
    truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
--- a/flask_app/main/资格评审.py
+++ b/flask_app/main/资格评审.py
@ -110,13 +110,17 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
                    for question, response in results2:
                        cleaned_res = clean_json_string(response)
                        res_list.append(cleaned_res)  # 都是问资格评审表得出的
+                if res_list:
                    merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
                    consortium_dict = get_consortium_dict(knowledge_name)
-                updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # 合并字典
+                    updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)
                    return updated_qualify_json
+                else:
+                    print("资格评审： 无法获取大模型结果，返回空值")
+                    return {"资格评审": ""}
            else:
                print("资格评审： type2")
-                return get_all_dict(knowledge_name)
+                return get_all_dict(knowledge_name) or {"资格评审": ""}

        else:
            print("资格评审： type3")
@ -128,13 +132,13 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
                final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict)
                return final_qualify_json
            else:
-                return new_non_matching_json
+                return new_non_matching_json or {"资格评审": ""}

    elif matching_keys_list and truncate3=="":    #这种情况是评分办法前附表中有要求，但是没有正确截取到'资格审查表'
        print("资格评审： type4")
        final_qualification=get_all_dict(knowledge_name)
        final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
-        return final_qualify_json
+        return final_qualify_json or {"资格评审": ""}
    else:                                       #大多数情况
        print("资格评审： type5")
        user_querys = generate_qual_question(matching_keys_list)  # 生成提问->‘附件：资格审查’
@ -143,6 +147,7 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
        res_list = []
        if not results2:
            print("资格评审： 调用大模型未成功获取资格评审文件中的要求！")
+            return {"资格评审": ""}
        else:
            # 打印结果
            for question, response in results2:
@ -152,7 +157,7 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
        consortium_dict=get_consortium_dict(knowledge_name)
        updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # 合并字典
        final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
-        return final_qualify_json
+        return final_qualify_json or {"资格评审": ""}

 if __name__ == "__main__":
    # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年（2018 年、2019 年、2020 年）完', '类似项目业绩': '投标人近 5 年（2017 年至今）须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩，同时提供中标通知书及监理服务合同，项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师：至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'}
--- a/flask_app/货物标/test.py
+++ b/flask_app/货物标/test.py
@ -1,3 +1,108 @@
-from flask_app.main.json_utils import nest_json_under_key
-res=nest_json_under_key("","资格")
-print(res)
+import json
+import re
+#这个字典可能有嵌套，你需要遍历里面的键名，对键名作判断，而不是键值，具体是这样的：如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除，重新组织成一个字典格式的数据，你可以考虑用字符串列表来保持部分平级的数据
+#对于同级的键，如果数量>1且键名都统一，那么将键名去掉，用列表保持它们的键值
+#对于同一个字典中，可能存在若干键值对，若它们的键值都是""或者"/" 你就将它们的键值删去，它们的键名用字符串列表保存
+def is_numeric_key(key):
+    # 判断键名是否为序号形式：纯数字/点号（如 1、1.2）、半角或全角括号内的数字（如 (1)、（1））、单个字母（如 a、A）、
+    # 单个字母后跟数字（如 a1）、数字后跟单个字母（如 1a）、单个字母后跟点（如 a.）；字母-数字-字母的组合（如 a1b）不匹配
+    pattern = r'^[\d.]+$|^\(\d+\)$|^（\d+）$|^[a-zA-Z]$|^[a-zA-Z]\d+$|^\d+[a-zA-Z]$|^[a-zA-Z]\.$'
+    return re.match(pattern, key) is not None
+#TODO:如果键值中存在数字就不行
+#zbtest20也有问题
+def contains_number_or_index(key, value):
+    # 判断值是否是数字或数字字符串
+    is_number = isinstance(value, (int, float)) or (isinstance(value, str) and value.isdigit())
+    # 判断键是否包含 "序号"
+    contains_index = '序号' in key
+    # 判断值中是否包含数字
+    contains_digit = isinstance(value, str) and re.search(r'\d+', value)
+    # 判断值中是否包含中文字符
+    contains_chinese = isinstance(value, str) and re.search(r'[\u4e00-\u9fff]', value)
+    # 如果值中包含数字但也有中文字符，则保留（返回 False）
+    if contains_digit and contains_chinese:
+        return False
+    # 其余情况：值是数字/纯数字字符串、键包含 "序号"、或值含数字（此时必不含中文）时返回真值（True 或 re.Match 对象），否则返回假值
+    return is_number or contains_index or contains_digit
+
+def preprocess_dict(data):
+    if isinstance(data, dict):
+        if len(data) > 1:
+            # 检查是否所有值都是 "" 或 "/"
+            if all(v == "" or v == "/" for v in data.values()):
+                return list(data.keys())
+            else:
+                processed = {}
+                for k, v in data.items():
+                    if not contains_number_or_index(k, v):
+                        processed_v = preprocess_dict(v)
+                        if processed_v != "":  # 仅跳过空字符串；空列表、空字典等其他值仍会保留
+                            processed[k] = processed_v
+                return processed
+        else:
+            return {k: preprocess_dict(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [preprocess_dict(item) for item in data]
+    else:
+        return data
+def process_dict(data):
+    if not isinstance(data, dict):
+        return data
+
+    result = {}
+    numeric_keys = []
+    non_numeric_keys = {}
+
+    for key, value in data.items():
+        if is_numeric_key(key):
+            numeric_keys.append((key, value))
+        else:
+            non_numeric_keys[key] = value
+
+    # 处理数字键，不再要求数量>1；注意 sorted 按键名的字符串顺序排序（例如 "10" 排在 "2" 之前）
+    if numeric_keys:
+        result['items'] = [process_dict(item[1]) for item in sorted(numeric_keys)]
+
+    # 处理非数字键
+    for key, value in non_numeric_keys.items():
+        if isinstance(value, list) and len(value) > 1 and all(isinstance(item, dict) and len(item) == 1 for item in value):
+            common_key = next(iter(value[0].keys()))
+            if all(common_key in item and len(item) == 1 for item in value):
+                result[key] = [process_dict(item[common_key]) for item in value]
+            else:
+                result[key] = [process_dict(item) for item in value]
+        else:
+            result[key] = process_dict(value)
+
+    # 如果结果只包含'items'键，直接返回其值
+    if len(result) == 1 and 'items' in result:
+        return result['items']
+
+    return result
+
+# 测试代码
+
+input_data = {
+    "符合性审查": {
+        "说明": "评标委员会应当对符合资格的投标人的投标文件进行符合性审查，以确定其是否满足招标文件的实质性要求。",
+        "审查标准": [
+            {
+                "序号": 9,
+                "内容": "未按要求提供加盖公章及签字（签章）的；"
+            },
+            {
+                "序号": 1,
+                "内容": "a39"
+            },
+            {
+                "序号": 2,
+                "内容": "依据财库[2019]9号文的规定，招标文件采购清单中为“节能产品”的货物，未提供国家确定的认证机构出具的节能产品认证证书的；"
+            },
+
+        ]
+    }
+}
+pred=preprocess_dict(input_data)
+print(json.dumps(pred, ensure_ascii=False, indent=4))
+# processed_data = process_dict(pred)
+# print(json.dumps(processed_data, ensure_ascii=False, indent=4))
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@ -72,9 +72,8 @@ def generate_queries(truncate_file, required_keys):
    return queries


-def get_business_requirements(truncate_file):
+def get_business_requirements(truncate_file,file_id):
    required_keys = ["商务要求", "服务要求", "其他要求"]
-    file_id = upload_file(truncate_file)
    queries = generate_queries(truncate_file, required_keys)
    results = multi_threading(queries, "", file_id, 2)
    business_requirements = [res for _, res in results] if results else []
@ -83,9 +82,11 @@ def get_business_requirements(truncate_file):
    final_res = combine_json_results(business_requirements)
    final_res.update({key: final_res.get(key, "") for key in required_keys})

-    return json.dumps(final_res, ensure_ascii=False, indent=4)
+    return final_res


 if __name__ == "__main__":
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
-    print(get_business_requirements(truncate_file))
+    file_id = upload_file(truncate_file)
+    res=get_business_requirements(truncate_file,file_id)
+    print(json.dumps(res, ensure_ascii=False, indent=4))
--- a/flask_app/货物标/技术要求提取.py
+++ b/flask_app/货物标/技术要求提取.py
@ -97,8 +97,7 @@ def get_technical_requirements(file_id):
        final_res = postprocess(cleaned_res)
        print("更新后的采购需求处理完成.")
    # 输出最终的 JSON 字符串
-    json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
-    return json_string
+    return final_res

 def test_all_files_in_folder(input_folder, output_folder):
    # 确保输出文件夹存在
@ -128,7 +127,8 @@ if __name__ == "__main__":
    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    file_id = upload_file(truncate_file)
    res=get_technical_requirements(file_id)
-    print(res)
+    json_string = json.dumps(res, ensure_ascii=False, indent=4)
+    print(json_string)
    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
    # test_all_files_in_folder(input_folder, output_folder)
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@ -1,26 +1,40 @@
+import concurrent.futures
 import json
+import time

-from flask_app.货物标.货物标截取pdf import truncate_pdf_main
-from flask_app.main.format_change import docx2pdf, pdf2docx
 from flask_app.货物标.技术要求提取 import get_technical_requirements
 from flask_app.main.通义千问long import upload_file
+from 商务服务其他要求提取 import get_business_requirements
+from flask_app.main.json_utils import nest_json_under_key
 #获取采购清单
-def fetch_purchasing_list(file_path,output_folder,file_type):
-    if file_type==1:
-        docx_path=file_path
-        pdf_path = docx2pdf(file_path)
-    elif file_type==2:
-        pdf_path=file_path
-        docx_path=pdf2docx(file_path)
-    else:
-        print("未传入指定格式的文件！")
-        return None
-    truncate_file=truncate_pdf_main(pdf_path,output_folder,1)
+def fetch_procurement_reqs(truncate_file):
+    # 上传文件并获取 file_id
    file_id = upload_file(truncate_file)
-    tech_reqs=get_technical_requirements(file_id)
+
+    # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # 提交任务给线程池
+        future_technical = executor.submit(get_technical_requirements, file_id)
+        time.sleep(1)
+        future_business = executor.submit(get_business_requirements, truncate_file, file_id)
+
+        # 获取并行任务的结果
+        technical_requirements = future_technical.result()
+        business_requirements = future_business.result()
+    # 构建最终的嵌套结构，确保四个键平级
+    procurement_reqs = {
+        "采购要求": {
+            "技术要求": technical_requirements.get("技术要求", {}),
+            "商务要求": business_requirements.get("商务要求", {}),
+            "服务要求": business_requirements.get("服务要求", {}),
+            "其他要求": business_requirements.get("其他要求", {})
+        }
+    }
+
+    return procurement_reqs

 if __name__ == "__main__":
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
-    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
-    fetch_purchasing_list(file_path,output_folder,1)
-
+    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_procurement.pdf"
+    res=fetch_procurement_reqs(file_path)
+    print(json.dumps(res, ensure_ascii=False, indent=4))
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -0,0 +1,13 @@
+from flask_app.main.format_change import docx2pdf, pdf2docx
+from flask_app.货物标.货物标截取pdf import truncate_pdf_main
+def main_processing(output_folder,file_path,file_type, unique_id):
+    if file_type==1:
+        docx_path=file_path
+        pdf_path = docx2pdf(file_path)
+    elif file_type==2:
+        pdf_path=file_path
+        docx_path=pdf2docx(file_path)
+    else:
+        print("未传入指定格式的文件！")
+        return None
+    truncate_file=truncate_pdf_main(pdf_path,output_folder,1)
--- a/flask_app/货物标/资格审查main.py
+++ b/flask_app/货物标/资格审查main.py
@ -0,0 +1,24 @@
+import json
+
+from flask_app.main.基础信息整合 import combine_basic_info
+from flask_app.main.通义千问long import qianwen_long,upload_file
+from flask_app.main.多线程提问 import multi_threading
+from flask_app.main.json_utils import combine_json_results
+def qualification_review(truncate_file):
+    file_id=upload_file(truncate_file)
+    user_query1="该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，对于原文中的序号，你仅需要捕获它们之间的层级关系并根据序号后的内容生成嵌套键值对，若多个内容位于同一层级，你应用字符串列表作为键值保存这些内容，你的回答需删去这些序号，但其余内容要与原文一致，不可擅自总结删减，也不要回答符合性审查的内容。"
+    user_query2="该招标文件中规定的符合性审查标准是怎样的？请以json格式给出，外层为'符合性审查'，你的回答要与原文一致，不可擅自总结删减，也不要回答资格性审查的内容。"
+    user_query=[]
+    user_query.append(user_query1)
+    user_query.append(user_query2)
+    results=multi_threading(user_query,"",file_id,2)
+    result_list=[]
+    for question, response in results:
+        result_list.append(response)
+    combined_res = combine_json_results(result_list)  # 合并资格性审查与符合性审查两个问题的回答结果
+    return combined_res
+
+if __name__ == "__main__":
+    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_qualification1.pdf"
+    res=qualification_review(truncate_file)
+    print(json.dumps(res,ensure_ascii=False, indent=4))