11.4

2024-11-04 17:13:06 +08:00 · 2024-11-04 17:13:06 +08:00 · 9f2fceb8c2
commit 9f2fceb8c2
parent 21351ddbe3
10 changed files with 242 additions and 105 deletions
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -1,3 +1,5 @@
 # -*- encoding:utf-8 -*-
 import json
 import re
 from functools import cmp_to_key
@ -196,26 +198,60 @@ def get_requirements_with_gpt(invalid_path, selection):
    file_id = upload_file(invalid_path)
    # 定义 selection 对应的用户查询
    user_queries = {
-        1: """
+        # 1: """
-            该招标文件中对投标文件的要求是什么？你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答，其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明，'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明，请以json格式返回给我结果，外层键名分别为'编写要求'，'格式'，'承诺书要求'，'递交要求'，你可以用嵌套键值对组织回答，嵌套键名为你对相关子要求的总结，而嵌套键名应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。输出格式示例如下：
+        #     该招标文件中对投标文件的要求是什么？你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答，其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明，而不一定一致；'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明，而不一定一致；请以json格式返回给我结果，外层键名分别为'编写要求'，'格式'，'承诺书要求'，'递交要求'，你可以用嵌套键值对组织回答，嵌套键名为你对相关子要求的总结，而嵌套键名应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。输出格式示例如下，内容与该文件无关：
        #        {
        #          "编写要求":"编写要求xxx",
        #          "格式要求":{
        #              "投标文件格式要求":"投标文件格式要求",
        #              "标记要求":"投标文件标记要求",
        #              "装订要求":"投标文件装订要求",
        #              "文件数量":"投标文件文件数量要求"
        #          },
        #          "承诺书要求":"承诺书要求xxx",
        #          "递交要求":{
        #              "投标地点":"投标地点",
        #              "投标文件交标方式":"交标方式",
        #              "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
        #          }
        #         }
        # """,
        1:"""
            该招标文件中对投标文件的要求有哪些？请以json格式给我返回结果，外层键名为文中提到的有关投标文件的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。示例如下，仅供格式参考：
            {
-                "编写要求":"投标函的编写要求xxx；法定代表人身份证明要求xx",
+                 "编写要求":"编写要求xxx",
-                "格式要求":{
+                 "格式要求":{
-                    "投标文件格式要求":"投标文件格式要求",
+                     "投标文件格式要求":"投标文件格式要求",
-                    "标记要求":"投标文件标记要求",
+                     "标记要求":"投标文件标记要求",
-                    "装订要求":"投标文件装订要求",
+                     "装订要求":"投标文件装订要求",
-                    "文件数量":"投标文件文件数量要求"
+                     "文件数量":"投标文件文件数量要求"
-                },
+                 },
-                "承诺书要求":"未知",
+                 "承诺书要求":"承诺书要求xxx",
-                "递交要求":{
+                 "递交要求":{
-                    "投标地点":"使用加密其投标文件的CA数字证书（企业锁）登录“电子交易系统”，进入“开标大厅”选择所投标段进行签到，并实时在线关注招标人的操作情况。",
+                     "递交投标文件的截止日期及递交地点":["递交投标文件的截止日期","递交投标文件的递交地点"],
-                    "投标文件交标方式":"线上开标",
+                     "投标文件交标方式":"交标方式",
-                    "投标文件的修改与撤回":"在投标人须知前附表规定的投标有效期内，投标人不得要求撤销或修改其投标文件。出现特殊情况需要延长投标有效期的，招标人以书面形式通知所有投标人延长投标有效期。投标人同意延长的，应相应延长其投标保证金的有效期，但不得要求或被允许修改或撤销其投标文件；投标人拒绝延长的，其投标失效，但投标人有权收回其投标保证金。"
+                     "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
                 }
                }
            }
        """,
-        2: """
+        # 2: """
-            该招标文件中开标、评标、定标要求(或磋商流程内容)是什么？你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答，其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明，'定标'可以从定标流程、履约能力的审查角度来说明，请以json格式返回给我结果，外层键名分别为'开标'、'开标异议'、'评标'、'定标'，你可以用嵌套键值对组织回答，嵌套键名为你对相关子要求的总结，而嵌套键名应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。输出格式示例如下：
+        #     该招标文件中开标、评标、定标要求(或磋商流程内容)是什么？你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答，其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明，'定标'可以从定标流程、履约能力的审查角度来说明，请以json格式返回给我结果，外层键名分别为'开标'、'开标异议'、'评标'、'定标'，你可以用嵌套键值对组织回答，嵌套键名为你对相关子要求的总结，而嵌套键值应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。输出格式示例如下：
        #     {
        #         "开标":"招标文件关于项目开标的要求",
        #         "开标异议":"招标文件中关于开标异议的项",
        #         "评标":{
        #             "特殊情况的处置":"因“电子交易系统”系统故障导致无法投标的，交易中心及时通知招标人，招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的，由投标人自行承担后果。",
        #             "评标办法及流程":"评标流程",
        #             "评标委员会的组建":"评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表，以及有关技术、经济等方面的专家组成。"
        #         },
        #         "定标":{
        #             "定标流程":"定标流程",
        #             "履约能力的审查":"履约能力的审查"
        #         }
        #     }
        # """,
        2:"""
        该招标文件中开评定标要求(或磋商流程内容)是什么？请以json格式给我返回结果，外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。示例如下，仅供格式参考：
            {
                "开标":"招标文件关于项目开标的要求",
                "开标异议":"招标文件中关于开标异议的项",
@ -231,7 +267,7 @@ def get_requirements_with_gpt(invalid_path, selection):
            }
        """,
        3: """
-            该招标文件中重新招标（或采购）、不再招标（或采购）、终止招标（或采购）的情况分别是什么？请以json格式返回给我结果，键名分别为'重新招标'，'不再招标'，'终止招标'，键值应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。示例输出如下：
+            该招标文件中重新招标（或重新采购）、不再招标（或不再采购）、终止招标（或终止采购）的情况分别是什么？请以json格式返回给我结果，键名分别为'重新招标'，'不再招标'，'终止招标'，键值应该完全与原文内容保持一致，不得擅自总结删减，如果原文中未提及相关内容，在键值中填'未知'。示例输出如下：
            {
                "重新招标":"有下列情形之一的，招标人将重新招标：（1）投标截止时间止，投标人少于3个的；（2）经评标委员会评审后否决所有投标的；",
                "不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的，属于必须审批或核准的工程建设项目，经原审批或核准部门批准后不再进行招标。",
@ -242,6 +278,7 @@ def get_requirements_with_gpt(invalid_path, selection):
    # 根据 selection 选择相应的 user_query
    user_query = user_queries.get(selection)
    if not user_query:
        return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
    # 调用大模型并处理响应
@ -250,4 +287,10 @@ def get_requirements_with_gpt(invalid_path, selection):
        cleaned_res = clean_json_string(res)
        return cleaned_res
    except Exception as e:
-        return {"error": "调用大模型失败"}
+        return {"error": "调用大模型失败"}
 if __name__ == "__main__":
    merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
    selection=2
    res=get_requirements_with_gpt(merged_baseinfo_path,selection)
    print(json.dumps(res,ensure_ascii=False,indent=4))
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
 if __name__ == '__main__':
-    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
+    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
@ -103,5 +103,5 @@ if __name__ == '__main__':
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
    # ress = extract_common_header(file_path)
    # print(ress)
-    res=extract_text_by_page(input_path)
+    res=extract_text_by_page(file_path)
    # print(res)磋商文件_tobidders_notice_part2.pdf
--- a/flask_app/general/通用功能函数.py
+++ b/flask_app/general/通用功能函数.py
@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading
 from flask_app.general.通义千问long import upload_file
 from flask_app.main.判断是否分包等 import read_questions_from_judge
-def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, baseinfo_list1):
+def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
    judge_consortium = judge_consortium_bidding(baseinfo_list1)
    if judge_consortium:
@ -18,7 +18,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, base
            "外层键名为'联合体投标要求'，其中有一个嵌套键值对为：\"是否接受联合体投标\":\"是\""
        )
        judge_questions.append(judge_consortium_question)
-    file_id3 = upload_file(baseinfo_path)
+    file_id3 = upload_file(merged_baseinfo_path)
    res2 = multi_threading(judge_questions, "", file_id3, 2)
    if not res2:
--- a/flask_app/main/基础信息整合快速版.py
+++ b/flask_app/main/基础信息整合快速版.py
@ -143,7 +143,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice):
        return []
-def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
+def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path):
    """
    综合和处理基础信息，生成最终的基础信息字典。
@ -173,7 +173,7 @@ def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_tabl
        # 提交两个任务
        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1)
        future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
-        future3 = executor.submit(extract_from_notice, invalid_path, clause_path, 3)  # 新增的多线程任务
+        future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3)  # 新增的多线程任务
        # 等待两个任务完成并获取结果
        future1.result()  # process_judge_questions 直接修改 baseinfo_list1，不需要返回值
--- a/flask_app/main/工程标解析main.py
+++ b/flask_app/main/工程标解析main.py
@ -1,11 +1,13 @@
 # -*- encoding:utf-8 -*-
 import json
 import logging
 import os
 import time
 from concurrent.futures import ThreadPoolExecutor
 from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt
 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.general.merge_pdfs import merge_pdfs
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
@ -68,6 +70,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
    merged_baseinfo_path=truncate_files[-1]
    more_path=[merged_baseinfo_path,tobidders_notice]
    merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
    merge_pdfs(more_path,merged_baseinfo_path_more)
    clause_path = convert_clause_to_json(tobidders_notice, output_folder)  # 投标人须知正文条款pdf->json
    end_time=time.time()
    logger.info(f"文件预处理 done，耗时：{end_time - start_time:.2f} 秒")
@ -83,21 +88,24 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
        'qualification': qualification,
        'truncate0_jsonpath': truncate_jsonpath,
        'merged_baseinfo_path':merged_baseinfo_path,
        'merged_baseinfo_path_more':merged_baseinfo_path_more,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath
    }
 # 基本信息
-def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
+def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more,tobidders_notice_table, tobidders_notice, clause_path):
    logger.info("starting 基础信息...")
    start_time = time.time()
    if not merged_baseinfo_path:
        merged_baseinfo_path = invalid_path
    if not merged_baseinfo_path_more:
        merged_baseinfo_path_more=invalid_path
    if not tobidders_notice_table:
        tobidders_notice_table = invalid_path
    if not tobidders_notice:
        tobidders_notice = invalid_path
-    basic_res = combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path)
+    basic_res = combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path)
    end_time = time.time()
    logger.info(f"基础信息 done，耗时：{end_time - start_time:.2f} 秒")
    return basic_res
@ -145,21 +153,25 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
 # 投标文件要求
-def fetch_bidding_documents_requirements(invalid_path, clause_path):
+def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more,clause_path):
    logger.info("starting 投标文件要求...")
    start_time = time.time()
    if not merged_baseinfo_path_more:
        merged_baseinfo_path_more=invalid_path
    selection = 1
-    fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path, clause_path, selection)
+    fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
    end_time = time.time()
    logger.info(f"投标文件要求 done，耗时：{end_time - start_time:.2f} 秒")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}
 # 开评定标流程
-def fetch_bid_opening(invalid_path, clause_path):
+def fetch_bid_opening(invalid_path, merged_baseinfo_path_more,clause_path):
    logger.info("starting 开评定标流程...")
    start_time = time.time()
    if not merged_baseinfo_path_more:
        merged_baseinfo_path_more=invalid_path
    selection = 2
-    fetch_bid_opening_json = extract_from_notice(invalid_path, clause_path, selection)
+    fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
    end_time = time.time()
    logger.info(f"开评定标流程 done，耗时：{end_time - start_time:.2f} 秒")
    return {"开评定标流程": fetch_bid_opening_json}
@ -177,7 +189,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # 立即启动不依赖 knowledge_name 和 index 的任务
        futures = {
-            'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],
+            'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'],
                                         processed_data['tobidders_notice_table'],
                                         processed_data['tobidders_notice'], processed_data['clause_path']),
            'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'],
@ -189,8 +201,8 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder, processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['qualification']),
-            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']),
+            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
-            'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'], processed_data['clause_path'])
+            'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
        }
        # 提前处理这些不依赖的任务，按完成顺序返回
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@ -30,32 +30,32 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
    pdf_document = PdfReader(file_path)
    all_pages_text = []
    start_index = None
    # 处理所有页面
    for i, page in enumerate(pdf_document.pages):
        page_text = page.extract_text() if page.extract_text() else ""
        cleaned_text = clean_page_content(page_text, common_header)
-        # print(cleaned_text)
+
        # 在第一页查找开始位置
        if i == 0 and start_index is None:
-            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
+            for pattern in (start_word if isinstance(start_word, list) else [start_word]):
-            if start_match:
+                start_match = re.search(pattern, cleaned_text)
-                start_index = start_match.start()
+                if start_match:
-                cleaned_text = cleaned_text[start_index:]
+                    start_index = start_match.start()
                    cleaned_text = cleaned_text[start_index:]
                    break  # 找到一个匹配后跳出循环
        # 在最后一页查找结束位置
        if i == len(pdf_document.pages) - 1:
-            for pattern in end_pattern:
+            matches = list(re.finditer(end_pattern, cleaned_text))
-                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
+            if matches:
-                if matches:
+                end_index = matches[-1].start()
-                    end_index = matches[-1].start()
+                cleaned_text = cleaned_text[:end_index]
                    cleaned_text = cleaned_text[:end_index]
                    break
        all_pages_text.append(cleaned_text)
    # 合并所有页面的文本
    full_text = "\n".join(all_pages_text)
    # print(full_text)
    return full_text
 def extract_section(text, start_pattern, end_phrases):
    # 查找开始模式
@ -113,8 +113,16 @@ def parse_text_by_heading(text):
    current_content = []
    append_newline = False
    lines = text.split('\n')
    def check_year_pattern(line):
        """Return True when *line* starts with a four-digit year followed by '年'.

        ASCII spaces are removed before matching, so '2014 年' and '20 14年'
        both count as year-led lines. The result is always a bool.
        """
        line_no_spaces = line.replace(' ', '')
        # '\d{4}' consumes exactly four characters, so '年' must sit at index 4.
        return re.match(r'^\d{4}年', line_no_spaces) is not None
    for i, line in enumerate(lines):    #由于本身就是按行读取处理，因此保存的时候不带'\n'
        line_stripped = line.strip().replace('．', '.')
        # 检查是否以年份开头，如果是，属于上一个标题内容
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue
        # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题，并确保其前后没有字母或括号
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
@ -168,31 +176,58 @@ def convert_to_json(file_path, start_word, end_phrases):
    parsed_data = parse_text_by_heading(text)
    return parsed_data
-def convert_clause_to_json(input_path,output_folder,type=1):
+
 def convert_clause_to_json(input_path, output_folder, type=1):
    if not os.path.exists(input_path):
        # print(f"The specified file does not exist: {input_path}")
        return ""
-    if type==1:
+
-        start_word = "投标人须知正文"
+    if type == 1:
-        end_phrases = [
+        # start_word 和编译后的 end_pattern 用于 type=1
-            r'^第[一二三四五六七八九十]+章\s*评标办法',
+        start_word = [
-            r'^评标办法前附表',
+            re.compile(
-            r'^附(?:录|件|表)(?:一)?[:：]'
+                r'^\s*(?:[（(]\s*[一1]?\s*[)）]\s*[、．.]*|[一1][、．.]+|[、．.]+)\s*(说\s*明|总\s*则)' 
                r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
                re.MULTILINE
            )
        ]
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'^评标办法前附表|'
            r'^附录(?:一)?[：:]|'
            r'^附件(?:一)?[：:]|'
            r'^附表(?:一)?[：:]',
            re.MULTILINE
        )
    else:
-        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：|投标邀请书|投标邀请函'
+        start_word = [
-        end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
+            re.compile(
-    result = convert_to_json(input_path, start_word, end_phrases)   #过滤无关信息
+                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
                re.MULTILINE
            ),
            re.compile(
                r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)）]?\s*$',
                re.MULTILINE
            )
        ]
        end_pattern = re.compile(
            r'第[一二三四五六七八九十]+章\s*投标人须知|'
            r'投标人须知前附表',
            re.MULTILINE
        )
    result = convert_to_json(input_path, start_word, end_pattern)  # 过滤无关信息
    # 检查输出文件夹是否存在，如果不存在则创建
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
-    # post_process_json(output_path)
+
    # print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
 # def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内容 zbtest20
@ -237,16 +272,16 @@ def convert_clause_to_json(input_path,output_folder,type=1):
    #     json.dump(processed_data, file, ensure_ascii=False, indent=4)
 if __name__ == "__main__":
-    file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\东莞支队查验招标文件_tobidders_notice_part2_1-5.pdf'
+    file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
    # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part1.pdf'
    # start_word = "投标人须知正文"
    # end_phrases = [
    #     r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
    #     r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    # ]
-    output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
    try:
-        output_path = convert_clause_to_json("",output_folder)
+        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)
--- a/flask_app/货物标/基础信息解析main.py
+++ b/flask_app/货物标/基础信息解析main.py
@ -114,8 +114,8 @@ def dynamic_key_handling(key_groups, detected_keys):
                if key not in group:
                    group.append(key)
-def get_base_info(baseinfo_file_path,clause_path):
+def get_base_info(merged_baseinfo_path,clause_path):
-    file_id = upload_file(baseinfo_file_path)
+    file_id = upload_file(merged_baseinfo_path)
    baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
@ -129,9 +129,9 @@ def get_base_info(baseinfo_file_path,clause_path):
    # judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # 提交两个任务
-        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path,
+        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path,
                                  baseinfo_list)
-        future2 = executor.submit(extract_from_notice, baseinfo_file_path, clause_path, 3)  # 新增的多线程任务
+        future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3)  # 新增的多线程任务
        # 等待两个任务完成并获取结果
        future1.result()  # process_judge_questions 直接修改 baseinfo_list，不需要返回值
@ -158,14 +158,14 @@ def get_base_info(baseinfo_file_path,clause_path):
    #         baseinfo_list.append(clean_json_string(response))
    return baseinfo_list
-def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path):
+def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
    baseinfo_list = []
    temp_list = []
    procurement_reqs = {}
    # 定义一个线程函数来获取基础信息
    def get_base_info_thread():
        nonlocal temp_list
-        temp_list = get_base_info(baseinfo_file_path,clause_path)
+        temp_list = get_base_info(merged_baseinfo_path,clause_path)
    # 定义一个线程函数来获取采购需求
    def fetch_procurement_reqs_thread():
        nonlocal procurement_reqs
@ -191,11 +191,11 @@ def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,
 if __name__ == "__main__":
    start_time=time.time()
    # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
-    baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
+    merged_baseinfo_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
    # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
    procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
    clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
-    res = combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path)
+    res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time=time.time()
--- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
+++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
@ -62,7 +62,7 @@ post_process 函数尝试将长字符串按特定模式分割成块，每块至
 """
 # 读取JSON数据，提取内容，转换结构，并打印结果
-def extract_from_notice(invalid_path,clause_path, type):
+def extract_from_notice(merged_baseinfo_path,clause_path, type):
    """
    从公告中提取特定类型的内容。
    Args:
@ -106,7 +106,7 @@ def extract_from_notice(invalid_path,clause_path, type):
        transformed_data = process_with_outer_key(extracted_data)
        final_result = process_nested_data(transformed_data)
        if not final_result:
-            final_result = get_requirements_with_gpt(invalid_path, type)    #万一没用正则匹配到，那就调用大模型
+            final_result = get_requirements_with_gpt(merged_baseinfo_path, type)    #万一没用正则匹配到，那就调用大模型
        return final_result
    except Exception as e:
@ -114,11 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type):
        return DEFAULT_RESULT
 if __name__ == "__main__":
-    file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
+    clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
-    invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf"
+    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
    try:
-        res = extract_from_notice(invalid_path,file_path, 1)  # 可以改变此处的 type 参数测试不同的场景
+        res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
        res2=json.dumps(res,ensure_ascii=False,indent=4)
        print(res2)
    except ValueError as e:
--- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
@ -16,6 +16,7 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
    # 从PDF文件中提取文本
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
    all_pages_text = []
    start_index = None
    # 处理所有页面
@ -132,18 +133,35 @@ def parse_text_by_heading(text):
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    temp_title = None  # 临时存储以点号开头但不带数字的标题
    lines = text.split('\n')
    def check_year_pattern(line):
        """Tell whether *line* opens with a year expression such as '2014年'.

        ASCII spaces are dropped first; any remaining whitespace between the
        four digits and '年' is tolerated by the pattern.
        """
        compact = line.replace(' ', '')
        year_prefix = re.match(r'^\d{4}\s*年', compact)
        return year_prefix is not None
    for i, line in enumerate(lines):
        line_stripped = line.strip().replace('．', '.')
        # 如果在一个章节内，跳过年份模式的行
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue
        # 匹配带数字的标题，例如 '12.1 内容'
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
-        # 新增：处理以点号开头的情况
+        # 匹配以点号开头并带有数字的情况，例如 '.12.1 内容'
-        dot_match = re.match(r'^\.(\d+(?:\.\d+)*)\s*(.+)$', line_stripped)
+        dot_match = re.match(r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$', line_stripped)
-        # 新增：处理没有点号的纯数字开头的情况
+        # 匹配以点号开头但不带数字的情况，例如 '.投标文件的制作和签署'
        dot_text_match = re.match(r'^[．.]\s*(\D.+)$', line_stripped)
        # 匹配不带点号的纯数字开头的情况，例如 '27xxxxx'
        pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
        if match:
@ -160,6 +178,14 @@ def parse_text_by_heading(text):
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None
            # 处理临时标题与新主标题的关联
            if temp_title:
                # 提取主编号（第一个点前的数字）
                main_number = new_key.split('.')[0]
                # 关联临时标题到主编号
                data[f"{main_number}."] = temp_title
                temp_title = None  # 重置临时标题
            # 保存阿拉伯数字的标题内容
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '第')):
@ -170,20 +196,13 @@ def parse_text_by_heading(text):
                current_content = [line_content]
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            # if current_key is None or (current_key != new_key and (     #不给序号排序
            #         len(current_content) == 0 or current_content[-1][-1] != '第')):
            #     if current_key is not None:
            #         content_string = ''.join(current_content).strip()
            #         data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
            #     current_key = new_key
            #     current_content = [line_content]
            #     append_newline = len(new_key.rstrip('.').split('.')) <= 2
            #     last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        elif dot_match:
-            # 处理以点号开头的情况
+            # 处理以点号开头并带有数字的情况
            sub_number, line_content = dot_match.groups()
            sub_number = sub_number.replace('．', '.')
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
@ -194,7 +213,13 @@ def parse_text_by_heading(text):
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
-        elif pure_number_match:  # 处理没有点号的纯数字开头的情况'27xxxxx'
+
        elif dot_text_match:
            # 处理以点号开头但不带数字的情况，存储到临时变量
            temp_title = dot_text_match.group(1).strip()
            continue  # 跳过进一步处理该行
        elif pure_number_match:  # 处理不带点号的纯数字开头的情况，例如 '27xxxxx'
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'  # 添加点号
            line_content = line_content.lstrip('.．、,')
@ -208,6 +233,12 @@ def parse_text_by_heading(text):
            # 判断新键名是否为新的标题，条件是键名递增且差值在5以内
            if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
                # 处理临时标题与新主标题的关联
                if temp_title:
                    main_number = new_key_candidate.split('.')[0]
                    data[f"{main_number}."] = temp_title
                    temp_title = None  # 重置临时标题
                # 开始新的标题
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
@@ -216,19 +247,28 @@ def parse_text_by_heading(text):
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
                # if current_key is None or (current_key != new_key and (     #不给序号排序
                #         len(current_content) == 0 or current_content[-1][-1] != '第')):
                #     if current_key is not None:
                #         content_string = ''.join(current_content).strip()
                #         data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                #     current_key = new_key
                #     current_content = [line_content]
                #     append_newline = len(new_key.rstrip('.').split('.')) <= 2
                #     last_main_number = new_key.split('.')[0]
            else:
                # 将当前行视为当前标题的内容
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        else:
-            if not skip_subheadings:
+            if not skip_subheadings:   # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
-                # 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑
+                pattern_title = re.compile(
-                pattern_title = re.compile(r'^\s*(?:[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
+                    r'^\s*(?:[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
                chinese_match = pattern_title.match(line_stripped)
                letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
                arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
-                # 优化了处理逻辑，减少冗余
+                # 优化处理逻辑，减少冗余
                if chinese_match or letter_match or arabic_match:
                    # 保存之前的 key 的内容（无论是中文标题还是阿拉伯数字标题）
                    if current_key is not None:
@@ -277,6 +317,9 @@ def parse_text_by_heading(text):
        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
        # 处理任何未关联的临时标题
        data["未分类标题"] = temp_title  # 您可以根据需要选择合适的键名
    return data
@@ -364,13 +407,13 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
-        start_word = r'^\s*[（(]?\s*[一1]\s*[)）]?\s*[、.]*\s*(说\s*明|总\s*则)'
+        start_word = r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则)'
-        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
+        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_by_heading(text)
    else:
-        start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
+        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*)$'
-        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
+        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result=parse_text_to_dict(text)
    # result = convert_to_json(input_path, start_word, end_pattern)
@@ -407,9 +450,9 @@ def process_folder(input_folder, output_folder):
 #TODO: 投标人须知正文这块，序号可能是乱序的，或许可以删除判断序号大小的逻辑，只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf'
+    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
-    output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
+    output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)
        print(f"Final JSON result saved to: {output_path}")
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@@ -129,22 +129,26 @@ def fetch_invalid_requirements(invalid_docpath, output_folder):
    logger.info(f"无效标与废标 done，耗时：{end_time - start_time:.2f} 秒")
    return find_invalid_res
-def fetch_bidding_documents_requirements(invalid_path,clause_path):
+def fetch_bidding_documents_requirements(invalid_path,merged_baseinfo_path,clause_path):
    logger.info("starting 投标文件要求...")
    if not merged_baseinfo_path:
        merged_baseinfo_path=invalid_path
    start_time = time.time()
    selection=1
-    fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection)
+    fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
    end_time = time.time()
    logger.info(f"投标文件要求 done，耗时：{end_time - start_time:.2f} 秒")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}
 # 开评定标流程
-def fetch_bid_opening(invalid_path,clause_path):
+def fetch_bid_opening(invalid_path,merged_baseinfo_path,clause_path):
    logger.info("starting 开评定标流程...")
    if not merged_baseinfo_path:
        merged_baseinfo_path=invalid_path
    start_time = time.time()
    selection=2
-    fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection)
+    fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
    end_time = time.time()
    logger.info(f"开评定标流程 done，耗时：{end_time - start_time:.2f} 秒")
    return {"开评定标流程": fetch_bid_opening_json}
@@ -201,9 +205,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
                                                    processed_data['evaluation_method_path']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder),
-            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],
+            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
                                                              processed_data['clause_path']),
-            'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']),
+            'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path']),
            'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
                                         processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']),
            'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,