9.26 分段解析完全版

2024-09-26 18:06:23 +08:00 · 2024-09-26 18:06:23 +08:00 · e6cedef2c2
commit e6cedef2c2
parent b653d97c54
7 changed files with 119 additions and 138 deletions
--- a/flask_app/main/test.py
+++ b/flask_app/main/test.py
@ -1,124 +1,19 @@
-import concurrent.futures
 import json
-import time
-import sys

-def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
-    time.sleep(2)  # 模拟任务耗时5秒
-    return {"basic_info": "Project basic info"}
-
-def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file_path, output_folder):
-    time.sleep(4)  # 模拟任务耗时10秒
-    return {"qualification_review": "Qualification review"}
-
-def fetch_evaluation_standards(truncate1):
-    time.sleep(6)  # 模拟任务耗时15秒
-    return {"technical_standards": "Technical standards", "commercial_standards": "Commercial standards"}
-
-def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
-    time.sleep(8)  # 模拟任务耗时20秒
-    return {"invalid_requirements": "Invalid requirements"}
-
-def fetch_bidding_documents_requirements(clause_path):
-    time.sleep(10)  # 模拟任务耗时25秒
-    return {"bidding_documents_requirements": "Bidding documents requirements"}
-
-def fetch_bid_opening(clause_path):
-    time.sleep(12)  # 模拟任务耗时30秒
-    return {"opening_bid": "Opening bid"}
-
-def transform_json_values(data):
-    # 假设这个函数对数据进行某些转换，简单返回数据
-    return data
-
-def get_global_logger(unique_id):
-    import logging
-    logger = logging.getLogger(unique_id)
-    return logger
-
-def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
-    # 模拟返回处理后的数据
-    return {
-        'knowledge_name': 'KnowledgeName_' + unique_id,
-        'truncate0': 'truncate0',
-        'truncate1': 'truncate1',
-        'truncate3': 'truncate3',
-        'truncate0_jsonpath': 'truncate0_jsonpath',
-        'clause_path': 'clause_path',
-        'invalid_docpath': 'invalid_docpath',
-        'input_file_path': 'input_file_path',
-        'output_folder': output_folder
-    }
-
-def deleteKnowledge(knowledge_index):
-    # 模拟删除操作
-    pass
-
-def main_process(output_folder, downloaded_file_path, file_type, unique_id):
-    global logger
-    logger = get_global_logger(unique_id)
-
-    # 预处理文件，获取处理后的数据
-    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
-    if not processed_data:
-        yield json.dumps({})  # 如果处理数据失败，返回空的 JSON
-
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = {
-            'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'],
-                                         processed_data['truncate0'], output_folder,
-                                         processed_data['clause_path']),
-            'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
-                                                    processed_data['truncate3'],
-                                                    processed_data['knowledge_name'],
-                                                    processed_data['truncate0_jsonpath'],
-                                                    processed_data['clause_path'], processed_data['input_file_path'],
-                                                    processed_data['output_folder']),
-            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
-            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
-                                                    output_folder, processed_data['truncate0_jsonpath'],
-                                                    processed_data['clause_path'], processed_data['truncate3']),
-            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
-                                                              processed_data['clause_path']),
-            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
-        }
-
-        # 按照任务完成的顺序返回结果
-        for future in concurrent.futures.as_completed(futures.values()):
-            key = next(k for k, v in futures.items() if v == future)
-            try:
-                result = future.result()
-
-                # 记录哪个任务完成了
-                logger.info(f"Task {key} completed.")
-
-                # 处理 evaluation_standards 返回的技术标和商务标分别作为两个结果
-                if key == 'evaluation_standards':
-                    technical_standards = result["technical_standards"]
-                    commercial_standards = result["commercial_standards"]
-
-                    # 分别发送技术标和商务标
-                    modified_technical_result = transform_json_values({"technical_standards": technical_standards})
-                    modified_commercial_result = transform_json_values({"commercial_standards": commercial_standards})
-
-                    yield json.dumps(modified_technical_result, ensure_ascii=False)  # 直接返回 JSON 数据
-                    yield json.dumps(modified_commercial_result, ensure_ascii=False)  # 直接返回 JSON 数据
-
-                    sys.stdout.flush()  # 强制刷新流式数据
-
-                else:
-                    modified_result = transform_json_values({key: result})
-                    yield json.dumps(modified_result, ensure_ascii=False)  # 直接返回 JSON 数据
-
-                    sys.stdout.flush()  # 强制刷新流式数据
-
-            except Exception as exc:
-                logger.error(f"Error processing {key}: {exc}")
-                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
-                sys.stdout.flush()
-
-    # 删除知识索引
-    deleteKnowledge(processed_data['knowledge_name'])
-
-
-# 调用主函数，模拟执行
+from flask_app.main.json_utils import extract_content_from_json
+from flask_app.main.多线程提问 import multi_threading
+from flask_app.货物标.资格审查main import update_json_data
+# Ad-hoc check: put one hand-written question to the knowledge base and print
+# whatever content can be extracted from the reply.
+ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的？请按json格式给我提供信息，键名为'资格性审查.资格要求'，你不能回答而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"]
+knowledge_name="6.2视频会议docx"
+first_response_list = []
+results = multi_threading(ques, knowledge_name)  # questions go to the model as-is
+for _, response in results:
+    try:
+        # A usable reply needs at least two elements; element 1 is the part
+        # handed to extract_content_from_json.
+        if not response or len(response) <= 1:
+            print(f"形式响应评审：Warning: Missing or incomplete response data for query index {_}.")
+            continue
+        temp = extract_content_from_json(response[1])
+        first_response_list.append(temp)
+    except Exception as e:
+        print(f"形式响应评审：Error processing response for query index {_}: {e}")
+print(first_response_list)
--- a/flask_app/main/形式响应评审.py
+++ b/flask_app/main/形式响应评审.py
@ -137,7 +137,9 @@ def reformat_questions(match_keys, input_path, output_folder):
    # 初始化文件处理
    generated_files = None
    file_processed = False
-
+    question_template = (
+        "关于'{key}'，{revised_value}的内容是怎样的？请按json格式给我提供信息，键名为'{key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"
+    )
    for entry in match_keys:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
@ -156,14 +158,14 @@ def reformat_questions(match_keys, input_path, output_folder):
                entries_with_numbers2.append({key: num})
            else:
                revised_value = value.replace('符合', '')
-                formatted_entry = f"关于'{key}'，{revised_value}的内容是怎样的？请按json格式给我提供信息，键名为'{key}',而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"
+                formatted_entry = question_template.format(key=key, revised_value=revised_value)
                formatted_questions.append(formatted_entry)
        elif match:
            num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
            entries_with_numbers.append({key: num})
        else:
            revised_value = value.replace('符合', '')
-            formatted_entry = f"关于'{key}'，{revised_value}的内容是怎样的？请按json格式给我提供信息，键名为'{key}',而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"
+            formatted_entry = question_template.format(key=key, revised_value=revised_value)
            formatted_questions.append(formatted_entry)

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -265,7 +265,7 @@ def truncate_pdf_multiple(input_path, output_folder):
 if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
-    input_path="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf"
+    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest17.pdf"
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp"
    truncate_pdf_multiple(input_path,output_folder)
    # selection = 3 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@ -292,9 +292,6 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
    # 删除知识索引
    deleteKnowledge(index)

-
-
-#TODO:流式输出要改商务标技术标整合
 if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"

--- a/flask_app/main/文件分类普通版.py
+++ b/flask_app/main/文件分类普通版.py
@ -340,7 +340,7 @@ def main(base_directory, intermediate_directory, final_directory):
    os.makedirs(final_directory, exist_ok=True)
    process_directory(base_directory, intermediate_directory, final_directory)

-#TODO：后期可加多线程进行处理
+
 if __name__ == "__main__":
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    intermediate_directory = 'D:\\tmp'
--- a/flask_app/main/禁止投标情形.py
+++ b/flask_app/main/禁止投标情形.py
@ -178,7 +178,6 @@ def find_forbidden(truncate_json_path,clause_path,truncate3):    #投标人须
    forbidden_dict={'不得存在的其他情形':merged_forbidden_list}

    return forbidden_dict
-
 #TODO:不得存在的情况文中有很多内容
 if __name__ == '__main__':
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
--- a/flask_app/货物标/资格审查main.py
+++ b/flask_app/货物标/资格审查main.py
@ -7,6 +7,7 @@ from flask_app.main.通义千问long import qianwen_long,upload_file
 from flask_app.main.多线程提问 import multi_threading
 from flask_app.main.json_utils import combine_json_results,clean_json_string
 from flask_app.main.形式响应评审 import update_json_data,extract_matching_keys
+from flask_app.main.json_utils import extract_content_from_json
 #这个字典可能有嵌套，你需要遍历里面的键名，对键名作判断，而不是键值，具体是这样的：如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除，重新组织成一个字典格式的数据，你可以考虑用字符串列表来保持部分平级的数据
 #对于同级的键，如果数量>1且键名都统一，那么将键名去掉，用列表保持它们的键值
 #对于同一个字典中，可能存在若干键值对，若它们的键值都是""或者"/" 你就将它们的键值删去，它们的键名用字符串列表保存
@ -102,6 +103,7 @@ def process_dict(data):


 def find_chapter_clause_references(data, parent_key=""):
+    exclude_list=["格式要求"]
    result = []
    # 正则匹配"第x章"或"第x款"
    chapter_clause_pattern = re.compile(r'第[一二三四五六七八九十\d]+[章款]')
@ -111,6 +113,11 @@ def find_chapter_clause_references(data, parent_key=""):
        # 生成当前的完整键名
        full_key = f"{parent_key}.{key}" if parent_key else key

+        # 检查是否应排除该键或值
+        if any(exclude_item in full_key for exclude_item in exclude_list) or \
+           (isinstance(value, str) and any(exclude_item in value for exclude_item in exclude_list)):
+            continue  # 如果在排除项中，跳过处理
+
        if isinstance(value, dict):
            # 如果值是字典，递归调用函数
            result.extend(find_chapter_clause_references(value, full_key))
@ -121,25 +128,106 @@ def find_chapter_clause_references(data, parent_key=""):

    return result

-# def reformat_questions(match_keys):
+def preprocess_value(value):
+    """Return the clause of ``value`` that cites a chapter or section.
+
+    ``value`` is split on Chinese/ASCII commas, full stops, question marks and
+    exclamation marks. A mark is not treated as a separator while a round
+    bracket (ASCII or full-width) is still open, so bracketed remarks stay
+    with their clause. The first clause containing a "第…章" / "第…款"
+    reference is returned with a leading "符合" or "应满足" removed.
+
+    Args:
+        value: Requirement text (a ``str``).
+
+    Returns:
+        The extracted clause, or ``value`` unchanged when it contains no
+        chapter/section reference.
+    """
+    # One pattern covers both reference kinds ("第X章" and "第X款").
+    reference_pattern = re.compile(r'第.+?[章款]')
+    if not reference_pattern.search(value):
+        return value

-def qualification_review(truncate_file):
+    # Punctuation marks that end a clause.
+    separators = '，。？！,?!'
+    parts = []
+    current_part = []
+    # Running totals of every bracket character seen so far.
+    count = {'(': 0, ')': 0, '（': 0, '）': 0}
+
+    for ch in value:
+        if ch in count:
+            count[ch] += 1
+        brackets_closed = count['('] == count[')'] and count['（'] == count['）']
+        if brackets_closed and ch in separators:
+            parts.append("".join(current_part).strip())
+            current_part = []
+        else:
+            current_part.append(ch)
+
+    if current_part:
+        parts.append("".join(current_part).strip())
+
+    # Prefer the clause that holds a real reference: a bare '章'/'款' test
+    # also matches unrelated words such as "公章" or "条款".
+    target_part = next((part for part in parts if reference_pattern.search(part)), None)
+    if target_part is None:
+        # The reference straddles a separator; fall back to the looser test.
+        target_part = next((part for part in parts if '章' in part or '款' in part), None)
+
+    if target_part:
+        # Drop a leading "符合" / "应满足" so only the reference remains.
+        return re.sub(r'^(符合|应满足)\s*', '', target_part.strip())
+
+    # Nothing usable was found: return the original text.
+    return value
+
+
+def generate_questions(input_list):
+    """Build one follow-up question for every key/value pair in ``input_list``.
+
+    Each value is first reduced by ``preprocess_value`` and then inserted,
+    together with its key, into a fixed question template.
+
+    Args:
+        input_list: Iterable of dicts; every pair of every dict yields a question.
+
+    Returns:
+        List of question strings, in iteration order.
+    """
+    question_format = "关于'{key}',{value}的内容是怎样的？请按json格式给我提供信息，键名为'{key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"
+    return [
+        question_format.format(key=name, value=preprocess_value(text))
+        for entry in input_list
+        for name, text in entry.items()
+    ]
+
+def update_json_data(original_data,response_list):
+    """Merge dotted-key answers from ``response_list`` into ``original_data``.
+
+    Every key is split on '.' and treated as a path into nested dicts. When
+    both the existing leaf and the new value are dicts they are merged (new
+    entries win); otherwise the leaf is overwritten.
+
+    NOTE(review): this module also imports ``update_json_data`` from
+    flask_app.main.形式响应评审; this later definition shadows that import.
+
+    Args:
+        original_data: Dict updated in place.
+        response_list: Iterable of dicts mapping dotted keys to new values.
+            Entries that are not dicts are skipped.
+
+    Returns:
+        ``original_data`` (the same object).
+    """
+    def recursive_update(data, key, value):
+        # Walk the dotted path, creating nested dicts where needed.
+        *parents, leaf = key.split('.')
+        for k in parents:
+            child = data.get(k)
+            if not isinstance(child, dict):
+                # A non-dict here cannot hold the nested answer. Replace it,
+                # the same way a leaf is overwritten, instead of crashing.
+                child = data[k] = {}
+            data = child
+        existing = data.get(leaf)
+        if isinstance(value, dict) and isinstance(existing, dict):
+            data[leaf] = {**existing, **value}
+        else:
+            data[leaf] = value
+    for response_dict in response_list:
+        if not isinstance(response_dict, dict):
+            # Nothing to merge from an empty or failed extraction.
+            continue
+        for key, value in response_dict.items():
+            recursive_update(original_data, key, value)
+    return original_data
+
+def qualification_review(truncate_file,knowledge_name):
+    """Extract the review standards from ``truncate_file`` and expand the
+    entries that only point at a chapter or section.
+
+    Round one uploads the file and asks two questions (qualification review
+    and conformity review, each requested as JSON). Every answer goes through
+    clean_json_string, preprocess_dict and process_dict and is merged into
+    one dict. Round two turns the entries returned by
+    find_chapter_clause_references into follow-up questions, asks them
+    against ``knowledge_name`` and merges the extracted answers back in.
+
+    Args:
+        truncate_file: Path handed to upload_file.
+        knowledge_name: Passed to multi_threading for the follow-up questions.
+
+    Returns:
+        The merged dict produced by update_json_data.
+    """
     file_id=upload_file(truncate_file)
     user_query=["该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，你的回答要与原文一致，不可擅自总结删减，也不要回答有关符合性性审查的内容。","该招标文件中规定的符合性审查标准是怎样的？请以json格式给出，外层为'符合性审查'，你的回答要与原文一致，不可擅自总结删减，也不要回答有关资格性审查的内容。"]
     results=multi_threading(user_query,"",file_id,2)
     combined_res = {}
     for question, response in results:
-        print(response)
         cleaned_data = clean_json_string(response)
         processed1 = preprocess_dict(cleaned_data)
         processed2 = process_dict(processed1)
         combined_res.update(processed2)
-    result=find_chapter_clause_references(combined_res)
-    print(result)
+    match_keys=find_chapter_clause_references(combined_res)
+    ques=generate_questions(match_keys)
+    results = multi_threading(ques, knowledge_name)  # questions without a clause number go straight to the model
+    first_response_list = []
+    # NOTE(review): `_` is the first element of each result pair; the loop
+    # above names that position `question`, while the messages below call it
+    # a "query index" - confirm which it is.
+    for _, response in results:
+        try:
+            if response and len(response) > 1:  # response must exist and have at least two elements
+                temp = extract_content_from_json(response[1])
+                first_response_list.append(temp)
+            else:
+                print(f"形式响应评审：Warning: Missing or incomplete response data for query index {_}.")
+        except Exception as e:
+            # Best effort: a failing answer is reported and skipped.
+            print(f"形式响应评审：Error processing response for query index {_}: {e}")
+    # print(first_response_list)
+    final_result=update_json_data(combined_res,first_response_list)
+    return final_result
 # 整合基础信息核心代码
-    return combined_res
 #[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求，并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明（格式要求详见本项目采购文件第六章相关格式要求）'}]
+#TODO:有个严重的问题，对于{'资格性审查.资格要求': '符合本采购文件第一章第二款要求，并提供合格有效的证明材料'}，调用百炼rag的时候容易得到一模一样的回答，而不是跳转到具体的地方，有三个思路，1.结构化第一章内容 2.优化提示词 3.构造问题的时候不带value，直接问key
 if __name__ == "__main__":
    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
-    res=qualification_review(truncate_file)
+    knowledge_name="6.2视频会议docx"
+    res=qualification_review(truncate_file,knowledge_name)
    print(json.dumps(res,ensure_ascii=False, indent=4))