11.6修复bug

2024-11-06 17:29:45 +08:00 · 2024-11-06 17:29:45 +08:00 · 21eeb87903
commit 21eeb87903
parent c0ba74a374
5 changed files with 134 additions and 72 deletions
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -34,7 +34,7 @@ def preprocess_data(data):
 # 转换结构化的JSON数据
-#No parent found at level 1 for key '24.2'. Check the data structure.
+#生成结构化的数据
 def transform_json(data):
    result = {}
    temp = {0: result}  # 初始化根字典
@ -183,19 +183,35 @@ def process_nested_data(data):
        # 到达最内层，处理非字典和非列表的元素（字符串）
        return post_process(data)
-def get_requirements_with_gpt(invalid_path, selection):
+#生成无结构的数据
 def concatenate_keys_values(section_content):
    """
    Join each key/value pair of a section into a single "key value" string.

    Args:
        section_content (dict): Key/value pairs making up one section's content.

    Returns:
        list of str: One "<key> <value>" string per pair, in the order the
        mapping yields its items.
    """
    return [f"{entry_key} {entry_value}" for entry_key, entry_value in section_content.items()]
 def get_requirements_with_gpt(merged_baseinfo_path, selection):
    """
    根据 selection 的值选择相应的用户查询，并调用大模型获取要求。
    Args:
-        invalid_path (str): 无效文件的路径，用于上传。
+        merged_baseinfo_path (str): 待上传的 merged_baseinfo 文件的路径。
        selection (int): 选择的类型（1、2 或 3）。
    Returns:
        dict: 大模型返回的要求结果，或错误信息。
    """
    # 上传文件并获取 file_id
-    file_id = upload_file(invalid_path)
+    file_id = upload_file(merged_baseinfo_path)
    # 定义 selection 对应的用户查询
    user_queries = {
        # 1: """
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -95,8 +95,8 @@ def extract_text_by_page(file_path):
 if __name__ == '__main__':
-    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
+    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\54d7aa40-8b36-4803-b6f7-278356ff1381\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
--- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
+++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
@ -2,7 +2,7 @@ import json
 import re
 from functools import cmp_to_key
-from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt
+from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt,concatenate_keys_values
 #提取两个大标题之间的内容
@ -103,19 +103,26 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
        # 提取目标部分
        extracted_data = extract_between_sections(data, target_values)  # 读取json，截取大标题之间的内容
-        transformed_data = process_with_outer_key(extracted_data)
+        if not extracted_data:
        final_result = process_nested_data(transformed_data)
        if not final_result:
            final_result = get_requirements_with_gpt(merged_baseinfo_path, type)    #万一没用正则匹配到，那就调用大模型
-        return final_result
+            return final_result
        # print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
        extracted_data_concatenated = {section: concatenate_keys_values(content)
                                       for section, content in extracted_data.items()}
        return extracted_data_concatenated
        # transformed_data = process_with_outer_key(extracted_data)
        # final_result = process_nested_data(transformed_data)
        # return final_result
    except Exception as e:
        print(f"Error in extract_from_notice: {e}")
        return DEFAULT_RESULT
 if __name__ == "__main__":
-    clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
+    clause_path = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\clause1.json'
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
+    merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_merged_baseinfo.pdf"
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
    try:
        res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
--- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
@ -105,12 +105,14 @@ def should_add_newline(content, keywords, max_length=20):
    content_str = ''.join(content).strip()
    return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
-def handle_content_append(current_content, line_content, append_newline, keywords):
+def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # 添加换行符
        append_newline = False
    current_content.append(line_content)
    if in_special_section:
        current_content.append('\n')
    return append_newline
 """
@ -134,7 +136,11 @@ def parse_text_by_heading(text):
    skip_subheadings = False
    last_main_number = None
    temp_title = None  # 临时存储以点号开头但不带数字的标题
-
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')
    initial_heading_pattern = None
    special_section_keywords = ['文件的组成', '文件的构成']  # 定义特殊章节关键词
    in_special_section = False  # 标志是否在特殊章节中
    lines = text.split('\n')
    def check_year_pattern(line):
@ -150,11 +156,22 @@ def parse_text_by_heading(text):
            current_content.append(line_stripped)
            continue
        # **首先检查是否进入特殊章节**
        if any(keyword in line_stripped for keyword in special_section_keywords):
            in_special_section = True
        # 匹配带数字的标题，例如 '12.1 内容'
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
        # 检查是否退出特殊章节
        if in_special_section:
            # 如果匹配到了主要标题，且不包含特殊关键词，则退出特殊章节
            if match and not any(keyword in line_stripped for keyword in special_section_keywords):
                in_special_section = False
        # 以下是原有的匹配逻辑
        # 匹配以点号开头并带有数字的情况，例如 '.12.1 内容'
        dot_match = re.match(r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$', line_stripped)
@ -197,7 +214,7 @@ def parse_text_by_heading(text):
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
        elif dot_match:
            # 处理以点号开头并带有数字的情况
@ -212,14 +229,15 @@ def parse_text_by_heading(text):
                current_content = [line_content]
                append_newline = True
            else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
        elif dot_text_match:
            # 处理以点号开头但不带数字的情况，存储到临时变量
            temp_title = dot_text_match.group(1).strip()
            continue  # 跳过进一步处理该行
-        elif pure_number_match:  # 处理不带点号的纯数字开头的情况，例如 '27xxxxx'
+        elif pure_number_match:
            # 处理不带点号的纯数字开头的情况
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'  # 添加点号
            line_content = line_content.lstrip('.．、,')
@ -247,69 +265,86 @@ def parse_text_by_heading(text):
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
                # if current_key is None or (current_key != new_key and (     #不给序号排序
                #         len(current_content) == 0 or current_content[-1][-1] != '第')):
                #     if current_key is not None:
                #         content_string = ''.join(current_content).strip()
                #         data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                #     current_key = new_key
                #     current_content = [line_content]
                #     append_newline = len(new_key.rstrip('.').split('.')) <= 2
                #     last_main_number = new_key.split('.')[0]
            else:
                # 将当前行视为当前标题的内容
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
        else:
-            if not skip_subheadings:   # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
+            if not skip_subheadings and not in_special_section:  # 增加 in_special_section 判断
-                pattern_title = re.compile(
+                numbered_match = pattern_numbered.match(line_stripped)
-                    r'^\s*(?:[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
+                parenthesis_match = pattern_parentheses.match(line_stripped)
                chinese_match = pattern_title.match(line_stripped)
                letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
                arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
-                # 优化处理逻辑，减少冗余
+                # 判断当前行是否匹配了任何标题模式
-                if chinese_match or letter_match or arabic_match:
+                if numbered_match or parenthesis_match or letter_match or arabic_match:
-                    # 保存之前的 key 的内容（无论是中文标题还是阿拉伯数字标题）
+                    # 如果初始标题模式尚未设置，则记录当前匹配的标题模式
-                    if current_key is not None:
+                    if initial_heading_pattern is None:
-                        content_string = ''.join(current_content).strip()
+                        if numbered_match:
-                        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                            initial_heading_pattern = 'numbered'
-                    if current_key_chinese is not None:
+                        elif parenthesis_match:
-                        data[current_key_chinese] = current_value_chinese
+                            initial_heading_pattern = 'parentheses'
-                        current_key_chinese = None
+                        elif letter_match:
                            initial_heading_pattern = 'letter'
                        elif arabic_match:
                            initial_heading_pattern = 'arabic'
-                    # 处理中文标题
+                    # 确定当前匹配的标题模式
-                    if chinese_match:
+                    if numbered_match:
-                        current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
+                        current_heading_pattern = 'numbered'
-                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('.．、,').replace(' ', '')
+                    elif parenthesis_match:
-                        if current_key_chinese in data:
+                        current_heading_pattern = 'parentheses'
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # 处理字母标题
                    elif letter_match:
-                        letter_key, letter_value = letter_match.groups()
+                        current_heading_pattern = 'letter'
                        letter_to_chinese = {
                            'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                            'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                            'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                        }
                        current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                        current_value_chinese = letter_value.lstrip('.．、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # 处理阿拉伯数字标题
                    elif arabic_match:
-                        arabic_key, arabic_value = arabic_match.groups()
+                        current_heading_pattern = 'arabic'
                        current_key = arabic_key.replace('、', '.')
                        current_content = [arabic_value]
                        append_newline = True
                        last_main_number = current_key.rstrip('.')
                    continue
-            # 如果没有匹配标题，继续处理正文内容
+                    # 如果当前标题模式与初始标题模式一致，创建新的键值对
-            if line_stripped:
+                    if current_heading_pattern == initial_heading_pattern:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                        # 保存之前的 key 的内容
                        if current_key is not None:
                            content_string = ''.join(current_content).strip()
                            data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
                            current_key_chinese = None
                        # 处理匹配到的标题
                        if current_heading_pattern == 'numbered':
                            current_key_chinese = numbered_match.group(1)
                            current_value_chinese = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
                            current_key_chinese = parenthesis_match.group(1)
                            current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'letter':
                            # 字母标题处理
                            letter_key, letter_value = letter_match.groups()
                            letter_to_chinese = {
                                'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                                'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                                'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                            }
                            current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                            current_value_chinese = letter_value.lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'arabic':
                            arabic_key, arabic_value = arabic_match.groups()
                            current_key = arabic_key.replace('、', '.')
                            current_content = [arabic_value]
                            append_newline = True
                            last_main_number = current_key.rstrip('.')
                        continue
                    else:
                        # 当前标题模式与初始模式不一致，将该行视为内容
                        if line_stripped:
                            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
                else:
                    # 未匹配到任何标题模式，将该行视为内容
                    if line_stripped:
                        append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
            else:
                # 在特殊章节中，所有内容都作为当前标题的内容
                if line_stripped:
                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
    # 最后保存最后一个 key 对应的内容
    if current_key is not None:
@ -448,11 +483,12 @@ def process_folder(input_folder, output_folder):
 #TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
 #TODO: 投标人须知正文这块，序号可能是乱序的，或许可以删除判断序号大小的逻辑，只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
 #TODO:11.6 目前 '文件的组成' 是匹配任意行,可以考虑只匹配'11.文件的组成' 前面有序号的行    a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf还有问题
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
-    output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
+    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)
        print(f"Final JSON result saved to: {output_path}")
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -243,6 +243,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 #广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
 #TODO: 目前跳转可能有个问题,资格审查那边:既有原来的内容又有跳转后的内容;符合本采购文件第一章第二款要求，并提供合格有效的证明材料<br>1、满足《中华人民共和国政府采购法》第二十二条规定，即：<br>（1）具有独立承担
 #TODO:资格审查默认加上第一章的内容, 资格审查章节不做跳转逻辑.
 #TODO:开评定标那块,不做层次遍历  ing
 #TODO:技术要求提取更全面一点  fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628
 #good_list 金额  截取上下文
 if __name__ == "__main__":
    # 配置日志器