11.17 修复了提取货物标的Bug，无效投标bug修复

2024-11-17 17:27:05 +08:00 · 2024-11-17 17:27:05 +08:00 · c93261207d
commit c93261207d
parent b94c62b7e6
14 changed files with 33 additions and 153 deletions
--- a/flask_app/general/table_content_extraction.py
+++ b/flask_app/general/table_content_extraction.py
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -185,7 +185,7 @@ def process_nested_data(data):

 #生成无结构的数据货物标
 def concatenate_keys_values(section_content):
-    print(json.dumps(section_content, ensure_ascii=False, indent=4))
+    # print(json.dumps(section_content, ensure_ascii=False, indent=4))
    """
    将章节内容的键值对拼接成一个字符串列表，每个元素为 "key value"。

--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@ -3,7 +3,7 @@ import os
 import re
 import time
 from concurrent.futures import ThreadPoolExecutor
-
+from flask_app.general.doubao import doubao_model, generate_full_user_query
 from docx import Document

 from flask_app.general.通义千问long import upload_file, qianwen_long_text
@ -39,7 +39,7 @@ def read_docx_last_column(truncate_file):

 # 完整读取文件中所有表格（适合pdf转docx价格便宜的情况，优先推荐，内容完整）
 def read_tables_from_docx(file_path):
-    print(file_path)
+    # print(file_path)
    # 尝试打开文档
    try:
        doc = Document(file_path)
@ -282,15 +282,13 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达

    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
-            # print(text_list)
-            # print("------------------")
            for data in text_list:
                # print(data)
                # 检查是否包含任何需要排除的字符串
                if any(exclude in data for exclude in excludes):
                    continue  # 如果包含任何排除字符串，跳过这个数据
                # 去掉开头的序号，eg:1 | (1) |（2） | 1. | 2．（全角点）| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
-                pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
+                pattern = r'^\s*([（(]\d+[)）)]|[A-Za-z]?\d+\s*(\.\s*\d+)*[\s\.、．)\）]|[一二三四五六七八九十]+、)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
@ -320,7 +318,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达
            # print("*********")
            new_text_list = preprocess_text_list(text_list)
            # 用于处理结构化文本，清理掉不必要的序号，并将分割后的段落合并，最终形成更简洁和格式化的输出。
-            pattern = r'^\s*([（(]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|．)?|[一二三四五六七八九十]+、)'
+            pattern = r'^\s*([（(]\d+[)）)]|[A-Za-z]?\d+\s*(\.\s*\d+)*[\s\.、．)\）]|[一二三四五六七八九十]+、)'

            data = re.sub(pattern, '', new_text_list[0]).strip()  # 去除序号
            # 将修改后的第一个元素和剩余的元素连接起来
@ -373,7 +371,7 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords):
        split_sentences = re.split(
            r'(?<=[。！？!?\?；])|'  # 在中文句号、感叹号、问号或分号后面分割
            r'(?=\d+[.．]\d+)|'  # 在类似1.1的数字序号前分割
-            r'(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒]))|'  # 数字后面跟空格且空格后面不是指定关键字时分割
+            r'(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒个]))|'  # 数字后面跟空格且空格后面不是指定关键字时分割
            r'(?=\d+[、.．])|'  # 在数字后直接跟顿号、半角点号或全角点号时分割
            r'(?=[A-Za-z][.．]\s*)|'  # 在字母加点（如A.、a.）前分割
            r'(?=[A-Za-z]+\s*\d+\s*(?:[.．]\s*\d+)*)|'  # 在可选字母加数字或多级编号前分割
@ -478,7 +476,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
        qianwen_txt = all_texts1 + all_tables1
        # Proceed only if there is content to write
        selected_contents = set()  # 使用 set 去重
-
        if qianwen_txt:
            with open(output_file, 'w', encoding='utf-8') as file:
                counter = 1
@ -486,11 +483,12 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
                    file.write(f"{counter}. {content}\n")
                    file.write("..............." + '\n')
                    counter += 1
-
-            file_id = upload_file(output_file)
-            # qianwen_ans = qianwen_long(file_id, user_query)
-            qianwen_ans = qianwen_long_text(file_id, user_query)
-            num_list = process_string_list(qianwen_ans)
+            user_query = generate_full_user_query(output_file, user_query)
+            model_ans=doubao_model(user_query)   #豆包
+            # file_id = upload_file(output_file)
+            # model_ans = qianwen_long(file_id, user_query)
+            # model_ans = qianwen_long_text(file_id, user_query)
+            num_list = process_string_list(model_ans)
            print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
@ -525,19 +523,25 @@ def combine_find_invalid(file_path, output_dir):
    queries = [
        (
            r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|无\s*效\s*标|视\s*为\s*无\s*效|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
-            "以上是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：否决投标或拒绝投标或无效投标或投标失效的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。",
+            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：否决投标或拒绝投标或无效投标或投标失效的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。
+            文本内容：{full_text}
+            """,
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        (
            r'废\s*标',
-            "以上是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：废标项的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。",
+            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，请你根据该内容回答：废标项的情况有哪些？文本中可能存在无关的信息，请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，x为符合的信息的序号，若情况不存在，返回[]。
+            文本内容：{full_text}
+            """,
            os.path.join(output_dir, "temp2.txt"),
            "废标项"
        ),
        (
            r'不\s*得|禁\s*止\s*投\s*标',
-            "以上是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，每条信息规定了各方不得存在的情形，请回答：在这些信息中，投标人或中标人或供应商或联合体投标各方或磋商小组不得存在的情形或禁止投标的情形有哪些？不要返回主语是招标人或采购人或评标委员会的信息，请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，示例返回为[1,4,6]，若情形不存在，返回[]。以下为需要考虑的注意事项：请返回包含实际内容的信息，若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的表格，而未说明具体的情形，则无需添加这条信息。",
+            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，每条信息规定了各方不得存在的情形，请回答：在这些信息中，投标人或中标人或供应商或联合体投标各方或磋商小组不得存在的情形或禁止投标的情形有哪些？不要返回主语是招标人或采购人或评标委员会的信息，请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，示例返回为[1,4,6]，若情形不存在，返回[]。以下为需要考虑的注意事项：请返回包含实际内容的信息，若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的表格，而未说明具体的情形，则无需添加这条信息。
+            文本内容：{full_text}
+            """,
            os.path.join(output_dir, "temp3.txt"),
            "不得存在的情形"
        )
@ -581,8 +585,8 @@ if __name__ == '__main__':
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件（实高电子显示屏）_tobidders_notice_part1.docx"
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
-    doc_path = r'D:\flask_project\flask_app\static\output\output1\6bf9792c-49a0-4a6a-8203-14b001f87911\ztbfile_invalid.docx'
-    output_dir = r"D:\flask_project\flask_app\static\output\output1\6bf9792c-49a0-4a6a-8203-14b001f87911\tmp"
+    doc_path = r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx'
+    output_dir = r"D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\tmp"
    results = combine_find_invalid(doc_path, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
--- a/flask_app/general/读取文件/读取docx.py
+++ b/flask_app/general/读取文件/读取docx.py
@ -90,7 +90,7 @@ def read_docx_by_paragraphs(file_path):
        return []

 if __name__ == "__main__":
-    file_path =  r'D:\flask_project\flask_app\static\output\output1\6bf9792c-49a0-4a6a-8203-14b001f87911\ztbfile_invalid.docx'
+    file_path =  r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx'
    read_docx(file_path)  #按行读取

    # paragraphs = read_docx_by_paragraphs(file_path)   #按段落读取
--- a/flask_app/main/工程标解析main.py
+++ b/flask_app/main/工程标解析main.py
@ -5,10 +5,8 @@ import os
 import time
 from concurrent.futures import ThreadPoolExecutor

-from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt
 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.general.merge_pdfs import merge_pdfs
-from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.general.无效标和废标公共代码 import combine_find_invalid
@ -18,9 +16,7 @@ from flask_app.main.基础信息整合快速版 import combine_basic_info
 from flask_app.main.资格审查模块 import combine_review_standards
 from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
 from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
-from flask_app.general.docx截取docx import copy_docx
-from flask_app.general.通义千问long import upload_file,qianwen_long
-from flask_app.general.json_utils import clean_json_string
+

 def get_global_logger(unique_id):
    if unique_id is None:
--- a/flask_app/main/资格审查模块.py
+++ b/flask_app/main/资格审查模块.py
@ -4,7 +4,7 @@ import time

 from flask_app.general.format_change import pdf2docx
 from flask_app.general.json_utils import extract_content_from_json, clean_json_string
-from flask_app.main.table_content_extraction import extract_tables_main
+from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.main.形式响应评审 import process_reviews
 from flask_app.main.资格评审 import process_qualification
 from flask_app.general.通义千问long import upload_file, qianwen_long
--- a/flask_app/main/资格评审new.py
+++ b/flask_app/main/资格评审new.py
@ -1,120 +0,0 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from flask_app.general.通义千问long import upload_file, qianwen_long
-from flask_app.general.json_utils import clean_json_string
-def combine_qualification_new(invalid_path, qualification_path,notice_path):
-    detailed_res = {}
-    # 初始化无效文件ID
-    invalid_file_id = None
-    if qualification_path:
-        # 上传资格文件并获取文件ID
-        qualification_file_id = upload_file(qualification_path)
-
-        # 定义第一个查询，用于检查资格性审查是否存在
-        first_query = """
-            该文档中是否有关于资格性审查标准的具体内容?请以json格式给出回答,外键为'资格性审查',键值仅限于'是','否',输出格式示例如下:
-            {
-                "资格性审查":"是"
-            }
-            """
-
-        # 执行第一个查询并清洗返回的JSON字符串
-        print("call first_query")
-        first_res = clean_json_string(qianwen_long(qualification_file_id, first_query))
-        # 判断是否存在资格性审查
-        zige_file_id = qualification_file_id if first_res.get("资格性审查") == "是" else None
-
-        # 如果需要，上传无效文件
-        if zige_file_id is None:
-            if invalid_file_id is None:
-                invalid_file_id = upload_file(invalid_path)
-            zige_file_id = invalid_file_id
-
-    else:
-        # 如果 qualification_path 为空，直接使用无效文件
-        zige_file_id = upload_file(invalid_path)
-
-    # 定义第二组查询，仅包含资格性审查
-    second_query = [
-        {
-            "key": "资格性审查",
-            "query": "该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，你的回答要与原文完全一致，不可擅自总结删减，也不要回答有关符合性审查的内容。"
-        }
-    ]
-
-    # 定义任务函数
-    def process_second_query(key, query, file_id):
-        print("call second_query")
-        try:
-            res = qianwen_long(file_id, query)
-            cleaned_res = clean_json_string(res)
-            return key, cleaned_res.get(key, "未找到相关内容")
-        except Exception as e:
-            print(f"执行查询 '{key}' 时出错: {e}")
-            return key, "查询失败"
-
-    def process_notice(notice_path):
-        print("call notice_path")
-        try:
-            # 上传通知文件并获取文件ID
-            file_id1 = upload_file(notice_path)
-
-            # 定义用户查询，提取申请人资格要求
-            user_query1 = """
-                第一章招标公告（投标邀请书）中说明的申请人资格要求是怎样的？请以json格式给出回答，外键为'申请人资格要求'，键值为字符串列表，其中每个字符串对应原文中的一条要求，你的回答与原文内容一致，不要擅自总结删减。输出格式示例如下：
-                {
-                    "申请人资格要求":[
-                        "1.满足《中华人民共和国政府采购法》第二十二条规定；",
-                        "1.1 法人或者其他组织的营业执照等证明文件，如供应商是自然人的提供身份证明材料；",
-                        "2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单；"
-                    ]
-                }
-                """
-
-            # 执行查询并清洗结果
-            res1 = clean_json_string(qianwen_long(file_id1, user_query1))
-
-            # 提取申请人资格要求
-            requirements = res1.get("申请人资格要求", "未找到相关内容")
-            return "申请人资格要求", requirements
-        except Exception as e:
-            print(f"处理申请人资格要求时出错: {e}")
-            return "申请人资格要求", "处理失败"
-
-    # 初始化 ThreadPoolExecutor
-    with ThreadPoolExecutor(max_workers=2) as executor:
-        future_to_key = {}
-
-        # 提交第二组查询
-        for query_info in second_query:
-            key = query_info["key"]
-            query = query_info["query"]
-            current_file_id = zige_file_id
-            future = executor.submit(process_second_query, key, query, current_file_id)
-            future_to_key[future] = key
-
-        # 有条件地提交通知处理
-        if notice_path:
-            future = executor.submit(process_notice, notice_path)
-            future_to_key[future] = "申请人资格要求"
-        else:
-            future = executor.submit(process_notice, invalid_path)
-            future_to_key[future] = "申请人资格要求"
-
-        # 收集结果（按完成顺序）
-        for future in as_completed(future_to_key):
-            key, result = future.result()
-            detailed_res[key] = result
-
-    # 定义所需的顺序
-    desired_order = ["申请人资格要求", "资格性审查"]
-    # print(json.dumps(detailed_res,ensure_ascii=False,indent=4))
-    # 创建一个新的有序字典
-    ordered_res = {}
-    for key in desired_order:
-        if key in detailed_res:
-            ordered_res[key] = detailed_res[key]
-
-    # 最终处理结果，例如打印或保存
-    return {"资格审查": ordered_res}
-
-
--- a/flask_app/old_version/截取pdf_old.py
+++ b/flask_app/old_version/截取pdf_old.py
--- a/flask_app/old_version/招标文件解析.py
+++ b/flask_app/old_version/招标文件解析.py
@ -4,7 +4,7 @@ import logging
 import time
 from concurrent.futures import ThreadPoolExecutor
 from flask_app.main.截取pdf import truncate_pdf_multiple
-from flask_app.main.table_content_extraction import extract_tables_main
+from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
--- a/flask_app/old_version/无效标和废标和禁止投标整合.py
+++ b/flask_app/old_version/无效标和废标和禁止投标整合.py
@ -8,7 +8,7 @@ from flask_app.general.format_change import pdf2docx
 from flask_app.general.通义千问long import upload_file, qianwen_long_text
 from concurrent.futures import ThreadPoolExecutor

-from flask_app.main.table_content_extraction import extract_tables_main
+from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.old_version.不得存在及禁止投标情形 import find_forbidden, process_string_list


--- a/flask_app/old_version/解析old.py
+++ b/flask_app/old_version/解析old.py
@ -4,7 +4,7 @@ import logging
 import time
 from concurrent.futures import ThreadPoolExecutor
 from flask_app.main.截取pdf import truncate_pdf_multiple
-from flask_app.main.table_content_extraction import extract_tables_main
+from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -292,7 +292,7 @@ def get_technical_requirements(file_path,invalid_path):
                        5.不包含'说明'、'规格'、'技术参数'等列内容，仅返回采购的货物或系统或模块名称。

                        特殊情况处理：
-                        同一层级（如同一系统中）下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
+                        若同一层级（如同一系统中）下存在同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增，规避重复键名的问题。

                        示例输出1，普通系统、货物类采购：
                        {{
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@ -20,7 +20,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
    }

    # 如果 procurement_path 是空字符串，直接返回默认采购需求字典（DEFAULT_PROCUREMENT_REQS）的副本
-    if not procurement_docpath:
+    if not procurement_path:
        return DEFAULT_PROCUREMENT_REQS.copy()

    try:
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -78,7 +78,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
        'merged_baseinfo_path': merged_baseinfo_path
    }

-def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, procurement_path, clause_path,logger):
+def fetch_project_basic_info(invalid_path,merged_baseinfo_path, procurement_path, clause_path,logger):
    logger.info("starting 基础信息...")
    start_time = time.time()
    if not merged_baseinfo_path:
@ -202,7 +202,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
                                                              processed_data['clause_path'],logger),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path'],logger),
-            'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
+            'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
                                         processed_data['procurement_path'],processed_data['clause_path'],logger),
            'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],
                                                    processed_data['qualification_path'],