11.4
commit 9f2fceb8c2
parent 21351ddbe3
@@ -1,3 +1,5 @@
# -*- encoding:utf-8 -*-
import json
import re
from functools import cmp_to_key
@@ -196,26 +198,60 @@ def get_requirements_with_gpt(invalid_path, selection):
file_id = upload_file(invalid_path)
# 定义 selection 对应的用户查询
user_queries = {
1: """
该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# 1: """
# 该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,而不一定一致;'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,而不一定一致;请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下,内容与该文件无关:
# {
# "编写要求":"编写要求xxx",
# "格式要求":{
# "投标文件格式要求":"投标文件格式要求",
# "标记要求":"投标文件标记要求",
# "装订要求":"投标文件装订要求",
# "文件数量":"投标文件文件数量要求"
# },
# "承诺书要求":"承诺书要求xxx",
# "递交要求":{
# "投标地点":"投标地点",
# "投标文件交标方式":"交标方式",
# "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
# }
# }
# """,
1:"""
该招标文件中对投标文件的要求有哪些?请以json格式给我返回结果,外层键名为文中提到的有关投标文件的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
{
"编写要求":"投标函的编写要求xxx;法定代表人身份证明要求xx",
"格式要求":{
"投标文件格式要求":"投标文件格式要求",
"标记要求":"投标文件标记要求",
"装订要求":"投标文件装订要求",
"文件数量":"投标文件文件数量要求"
},
"承诺书要求":"未知",
"递交要求":{
"投标地点":"使用加密其投标文件的CA数字证书(企业锁)登录“电子交易系统”,进入“开标大厅”选择所投标段进行签到,并实时在线关注招标人的操作情况。",
"投标文件交标方式":"线上开标",
"投标文件的修改与撤回":"在投标人须知前附表规定的投标有效期内,投标人不得要求撤销或修改其投标文件。出现特殊情况需要延长投标有效期的,招标人以书面形式通知所有投标人延长投标有效期。投标人同意延长的,应相应延长其投标保证金的有效期,但不得要求或被允许修改或撤销其投标文件;投标人拒绝延长的,其投标失效,但投标人有权收回其投标保证金。"
"编写要求":"编写要求xxx",
"格式要求":{
"投标文件格式要求":"投标文件格式要求",
"标记要求":"投标文件标记要求",
"装订要求":"投标文件装订要求",
"文件数量":"投标文件文件数量要求"
},
"承诺书要求":"承诺书要求xxx",
"递交要求":{
"递交投标文件的截止日期及递交地点":["递交投标文件的截止日期","递交投标文件的递交地点"],
"投标文件交标方式":"交标方式",
"投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
}
}
}
""",
2: """
该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# 2: """
# 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# {
# "开标":"招标文件关于项目开标的要求",
# "开标异议":"招标文件中关于开标异议的项",
# "评标":{
# "特殊情况的处置":"因“电子交易系统”系统故障导致无法投标的,交易中心及时通知招标人,招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的,由投标人自行承担后果。",
# "评标办法及流程":"评标流程",
# "评标委员会的组建":"评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表,以及有关技术、经济等方面的专家组成。"
# },
# "定标":{
# "定标流程":"定标流程",
# "履约能力的审查":"履约能力的审查"
# }
# }
# """,
2:"""
该招标文件中开评定标要求(或磋商流程内容)是什么?请以json格式给我返回结果,外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
{
"开标":"招标文件关于项目开标的要求",
"开标异议":"招标文件中关于开标异议的项",
@@ -231,7 +267,7 @@ def get_requirements_with_gpt(invalid_path, selection):
}
""",
3: """
该招标文件中重新招标(或采购)、不再招标(或采购)、终止招标(或采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下:
该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下:
{
"重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;",
"不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的,属于必须审批或核准的工程建设项目,经原审批或核准部门批准后不再进行招标。",
@@ -242,6 +278,7 @@ def get_requirements_with_gpt(invalid_path, selection):
# 根据 selection 选择相应的 user_query
user_query = user_queries.get(selection)
if not user_query:
return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
# 调用大模型并处理响应
@@ -251,3 +288,9 @@ def get_requirements_with_gpt(invalid_path, selection):
return cleaned_res
except Exception as e:
return {"error": "调用大模型失败"}
if __name__ == "__main__":
merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
selection=2
res=get_requirements_with_gpt(merged_baseinfo_path,selection)
print(json.dumps(res,ensure_ascii=False,indent=4))
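For readers skimming the diff, the control flow that get_requirements_with_gpt now follows reduces to the sketch below. It only illustrates the dispatch-and-fallback pattern visible in the hunks above; upload_file and the 通义千问 call are replaced by a stubbed ask_model parameter, so none of the helper names here are the project's real API.

import json

def get_requirements_sketch(pdf_path, selection, ask_model=None):
    # selection -> 提问模板(此处仅示意,完整提示词见上方 diff)
    user_queries = {1: "投标文件要求……", 2: "开评定标要求……", 3: "重新招标/不再招标/终止招标……"}
    user_query = user_queries.get(selection)
    if not user_query:
        return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
    try:
        raw = ask_model(pdf_path, user_query) if ask_model else "{}"   # 真实代码会上传文件并调用大模型
        return json.loads(raw)                                          # 对应清洗后的 cleaned_res
    except Exception:
        return {"error": "调用大模型失败"}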
@@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
@@ -103,5 +103,5 @@ if __name__ == '__main__':
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(file_path)
# print(ress)
res=extract_text_by_page(input_path)
res=extract_text_by_page(file_path)
# print(res)磋商文件_tobidders_notice_part2.pdf
@@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.main.判断是否分包等 import read_questions_from_judge
def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, baseinfo_list1):
def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
judge_consortium = judge_consortium_bidding(baseinfo_list1)
if judge_consortium:
@@ -18,7 +18,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, base
"外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
)
judge_questions.append(judge_consortium_question)
file_id3 = upload_file(baseinfo_path)
file_id3 = upload_file(merged_baseinfo_path)
res2 = multi_threading(judge_questions, "", file_id3, 2)
if not res2:
@@ -143,7 +143,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice):
return []
def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path):
"""
综合和处理基础信息,生成最终的基础信息字典。
@@ -173,7 +173,7 @@ def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_tabl
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1)
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
future3 = executor.submit(extract_from_notice, invalid_path, clause_path, 3) # 新增的多线程任务
future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list1,不需要返回值
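The executor block above now juggles three futures, one of which mutates baseinfo_list1 in place. A minimal stand-alone sketch of that wiring, with placeholder workers rather than the real process_judge_questions / process_baseinfo_list / extract_from_notice, looks like this:

from concurrent.futures import ThreadPoolExecutor

def combine_sketch():
    baseinfo_list1 = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        f1 = executor.submit(lambda: baseinfo_list1.append({"联合体投标要求": "未知"}))  # 原地修改列表,无返回值
        f2 = executor.submit(lambda: [{"投标人须知": "..."}])                            # 返回补充的基础信息
        f3 = executor.submit(lambda: {"重新招标": "未知"})                                # 对应 extract_from_notice(..., 3)
        f1.result()                        # 只等待副作用完成
        baseinfo_list1.extend(f2.result())
        baseinfo_list1.append(f3.result())
    return baseinfo_list1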
@@ -1,11 +1,13 @@
# -*- encoding:utf-8 -*-
import json
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values
@@ -68,6 +70,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice]
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
merge_pdfs(more_path,merged_baseinfo_path_more)
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
end_time=time.time()
logger.info(f"文件预处理 done,耗时:{end_time - start_time:.2f} 秒")
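merge_pdfs itself is not part of this commit, so the following is only a guess at an equivalent, written with pypdf, to show what merged_baseinfo_path_more is: the merged base-info PDF with the 投标人须知 section appended behind it.

import os
from pypdf import PdfWriter

def merge_pdfs_sketch(paths, output_path):
    writer = PdfWriter()
    for p in paths:
        if p and os.path.exists(p):
            writer.append(p)              # 依次整本追加
    with open(output_path, "wb") as f:
        writer.write(f)

# 用法与上方新增代码对应(假设这两个路径已存在):
# merge_pdfs_sketch([merged_baseinfo_path, tobidders_notice], merged_baseinfo_path_more)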
@@ -83,21 +88,24 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
'qualification': qualification,
'truncate0_jsonpath': truncate_jsonpath,
'merged_baseinfo_path':merged_baseinfo_path,
'merged_baseinfo_path_more':merged_baseinfo_path_more,
'clause_path': clause_path,
'invalid_docpath': invalid_docpath
}
# 基本信息
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more,tobidders_notice_table, tobidders_notice, clause_path):
logger.info("starting 基础信息...")
start_time = time.time()
if not merged_baseinfo_path:
merged_baseinfo_path = invalid_path
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
if not tobidders_notice_table:
tobidders_notice_table = invalid_path
if not tobidders_notice:
tobidders_notice = invalid_path
basic_res = combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path)
basic_res = combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path)
end_time = time.time()
logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒")
return basic_res
@@ -145,21 +153,25 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
# 投标文件要求
def fetch_bidding_documents_requirements(invalid_path, clause_path):
def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 投标文件要求...")
start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path, clause_path, selection)
fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time()
logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒")
return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(invalid_path, clause_path):
def fetch_bid_opening(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 开评定标流程...")
start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 2
fetch_bid_opening_json = extract_from_notice(invalid_path, clause_path, selection)
fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time()
logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒")
return {"开评定标流程": fetch_bid_opening_json}
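fetch_bidding_documents_requirements and fetch_bid_opening now share the same shape: fall back to invalid_path when the merged PDF is missing, time the extract_from_notice call, and wrap the result under a fixed key. A generic sketch of that repeated pattern (extract is a stubbed parameter, logging reduced to print):

import time

def fetch_section_sketch(invalid_path, merged_baseinfo_path_more, clause_path, selection, key,
                         extract=lambda path, clause, sel: {}):
    path = merged_baseinfo_path_more or invalid_path    # 缺省回退到 invalid_path
    start = time.time()
    result = extract(path, clause_path, selection)
    print(f"{key} done,耗时:{time.time() - start:.2f} 秒")
    return {key: result}

# 例如:fetch_section_sketch(invalid_path, merged_baseinfo_path_more, clause_path, 1, "投标文件要求", extract_from_notice)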
@@ -177,7 +189,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
with concurrent.futures.ThreadPoolExecutor() as executor:
# 立即启动不依赖 knowledge_name 和 index 的任务
futures = {
'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],
'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'],
processed_data['tobidders_notice_table'],
processed_data['tobidders_notice'], processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'],
@@ -189,8 +201,8 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder, processed_data['truncate0_jsonpath'],
processed_data['clause_path'], processed_data['qualification']),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'], processed_data['clause_path'])
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
}
# 提前处理这些不依赖的任务,按完成顺序返回
@@ -30,32 +30,32 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
pdf_document = PdfReader(file_path)
all_pages_text = []
start_index = None
# 处理所有页面
for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# 在第一页查找开始位置
if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
for pattern in (start_word if isinstance(start_word, list) else [start_word]):
start_match = re.search(pattern, cleaned_text)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
break # 找到一个匹配后跳出循环
# 在最后一页查找结束位置
if i == len(pdf_document.pages) - 1:
for pattern in end_pattern:
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
break
matches = list(re.finditer(end_pattern, cleaned_text))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
all_pages_text.append(cleaned_text)
# 合并所有页面的文本
full_text = "\n".join(all_pages_text)
# print(full_text)
return full_text
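The trimming logic that changed in extract_text_from_pdf (start_word may now be a list of compiled patterns, end_pattern a single compiled pattern) can be exercised without a PDF. The sketch below reproduces just the first-page / last-page slicing shown in the hunk above:

import re

def trim_pages_sketch(pages, start_word, end_pattern):
    out = []
    start_found = False
    for i, text in enumerate(pages):
        if i == 0 and not start_found:
            for pattern in (start_word if isinstance(start_word, list) else [start_word]):
                m = re.search(pattern, text)
                if m:                              # 第一页:从首个命中的起始模式截起
                    text = text[m.start():]
                    start_found = True
                    break
        if i == len(pages) - 1:
            matches = list(re.finditer(end_pattern, text))
            if matches:                            # 最后一页:截到最后一次命中的结束模式
                text = text[:matches[-1].start()]
        out.append(text)
    return "\n".join(out)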
def extract_section(text, start_pattern, end_phrases):
# 查找开始模式
@@ -113,8 +113,16 @@ def parse_text_by_heading(text):
current_content = []
append_newline = False
lines = text.split('\n')
def check_year_pattern(line):
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == '年'
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip().replace('.', '.')
# 检查是否以年份开头,如果是,属于上一个标题内容
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
@@ -168,31 +176,58 @@ def convert_to_json(file_path, start_word, end_phrases):
parsed_data = parse_text_by_heading(text)
return parsed_data
def convert_clause_to_json(input_path,output_folder,type=1):
def convert_clause_to_json(input_path, output_folder, type=1):
if not os.path.exists(input_path):
# print(f"The specified file does not exist: {input_path}")
return ""
if type==1:
start_word = "投标人须知正文"
end_phrases = [
r'^第[一二三四五六七八九十]+章\s*评标办法',
r'^评标办法前附表',
r'^附(?:录|件|表)(?:一)?[::]'
if type == 1:
# start_word 和编译后的 end_pattern 用于 type=1
start_word = [
re.compile(
r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
re.MULTILINE
)
]
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|'
r'^附录(?:一)?[::]|'
r'^附件(?:一)?[::]|'
r'^附表(?:一)?[::]',
re.MULTILINE
)
else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:|投标邀请书|投标邀请函'
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息
start_word = [
re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
re.MULTILINE
),
re.compile(
r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$',
re.MULTILINE
)
]
end_pattern = re.compile(
r'第[一二三四五六七八九十]+章\s*投标人须知|'
r'投标人须知前附表',
re.MULTILINE
)
result = convert_to_json(input_path, start_word, end_pattern) # 过滤无关信息
# 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"Created output folder: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
# post_process_json(output_path)
# print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path
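A quick, self-contained way to see what the new compiled patterns for type=1 accept; the sample headings below are made up for the demo, while the regexes are copied from the hunk above:

import re

start_word = re.compile(
    r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
    r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
    re.MULTILINE
)
end_pattern = re.compile(
    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
    r'^评标办法前附表|'
    r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
    re.MULTILINE
)

for line in ["(一)说明", "一、总则", "供应商须知正文", "第三章 评标办法", "附表一:报价表"]:
    print(line, bool(start_word.match(line)), bool(end_pattern.match(line)))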
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
@@ -237,16 +272,16 @@ def convert_clause_to_json(input_path,output_folder,type=1):
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\东莞支队查验招标文件_tobidders_notice_part2_1-5.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.pdf'
# start_word = "投标人须知正文"
# end_phrases = [
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
# ]
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
try:
output_path = convert_clause_to_json("",output_folder)
output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
@@ -114,8 +114,8 @@ def dynamic_key_handling(key_groups, detected_keys):
if key not in group:
group.append(key)
def get_base_info(baseinfo_file_path,clause_path):
file_id = upload_file(baseinfo_file_path)
def get_base_info(merged_baseinfo_path,clause_path):
file_id = upload_file(merged_baseinfo_path)
baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
# baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
questions = read_questions_from_file(baseinfo_file_path)
@@ -129,9 +129,9 @@ def get_base_info(baseinfo_file_path,clause_path):
# judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path,
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path,
baseinfo_list)
future2 = executor.submit(extract_from_notice, baseinfo_file_path, clause_path, 3) # 新增的多线程任务
future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list,不需要返回值
@@ -158,14 +158,14 @@ def get_base_info(baseinfo_file_path,clause_path):
# baseinfo_list.append(clean_json_string(response))
return baseinfo_list
def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path):
def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
baseinfo_list = []
temp_list = []
procurement_reqs = {}
# 定义一个线程函数来获取基础信息
def get_base_info_thread():
nonlocal temp_list
temp_list = get_base_info(baseinfo_file_path,clause_path)
temp_list = get_base_info(merged_baseinfo_path,clause_path)
# 定义一个线程函数来获取采购需求
def fetch_procurement_reqs_thread():
nonlocal procurement_reqs
@@ -191,11 +191,11 @@ def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,
if __name__ == "__main__":
start_time=time.time()
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
merged_baseinfo_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
res = combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path)
res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
print("------------------------------------")
print(json.dumps(res, ensure_ascii=False, indent=4))
end_time=time.time()
@@ -62,7 +62,7 @@ post_process 函数尝试将长字符串按特定模式分割成块,每块至
"""
# 读取JSON数据,提取内容,转换结构,并打印结果
def extract_from_notice(invalid_path,clause_path, type):
def extract_from_notice(merged_baseinfo_path,clause_path, type):
"""
从公告中提取特定类型的内容。
Args:
@@ -106,7 +106,7 @@ def extract_from_notice(invalid_path,clause_path, type):
transformed_data = process_with_outer_key(extracted_data)
final_result = process_nested_data(transformed_data)
if not final_result:
final_result = get_requirements_with_gpt(invalid_path, type) #万一没用正则匹配到,那就调用大模型
final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型
return final_result
except Exception as e:
@@ -114,11 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type):
return DEFAULT_RESULT
if __name__ == "__main__":
file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf"
clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
try:
res = extract_from_notice(invalid_path,file_path, 1) # 可以改变此处的 type 参数测试不同的场景
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2)
except ValueError as e:
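The behavioural core of this file's change is the fallback inside extract_from_notice: parse the clause JSON first, and only call get_requirements_with_gpt on merged_baseinfo_path when nothing was matched. Stripped of the project helpers (both are stubbed parameters here), the skeleton is:

def extract_from_notice_sketch(merged_baseinfo_path, clause_path, type,
                               parse=lambda clause: {}, ask_gpt=lambda path, t: {}):
    try:
        final_result = parse(clause_path)                       # 先走正则/条款 JSON 解析
        if not final_result:
            final_result = ask_gpt(merged_baseinfo_path, type)  # 万一没匹配到,调用大模型兜底
        return final_result
    except Exception:
        return {}                                               # 对应 DEFAULT_RESULT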
@@ -16,6 +16,7 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本
common_header = extract_common_header(file_path)
pdf_document = PdfReader(file_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
all_pages_text = []
start_index = None
# 处理所有页面
@@ -132,18 +133,35 @@ def parse_text_by_heading(text):
append_newline = False
skip_subheadings = False
last_main_number = None
temp_title = None # 临时存储以点号开头但不带数字的标题
lines = text.split('\n')
def check_year_pattern(line):
# 检查是否为年份模式,如 '2014年'
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}\s*年', line_no_spaces) is not None
for i, line in enumerate(lines):
line_stripped = line.strip().replace('.', '.')
# 如果在一个章节内,跳过年份模式的行
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配带数字的标题,例如 '12.1 内容'
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
# 新增:处理以点号开头的情况
dot_match = re.match(r'^\.(\d+(?:\.\d+)*)\s*(.+)$', line_stripped)
# 匹配以点号开头并带有数字的情况,例如 '.12.1 内容'
dot_match = re.match(r'^[..](\d+(?:[..]\d+)*)\s*(.+)$', line_stripped)
# 新增:处理没有点号的纯数字开头的情况
# 匹配以点号开头但不带数字的情况,例如 '.投标文件的制作和签署'
dot_text_match = re.match(r'^[..]\s*(\D.+)$', line_stripped)
# 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
if match:
@@ -160,6 +178,14 @@ def parse_text_by_heading(text):
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
# 处理临时标题与新主标题的关联
if temp_title:
# 提取主编号(第一个点前的数字)
main_number = new_key.split('.')[0]
# 关联临时标题到主编号
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 保存阿拉伯数字的标题内容
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '第')):
@@ -170,20 +196,13 @@ def parse_text_by_heading(text):
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2
last_main_number = new_key.split('.')[0]
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif dot_match:
# 处理以点号开头的情况
# 处理以点号开头并带有数字的情况
sub_number, line_content = dot_match.groups()
sub_number = sub_number.replace('.', '.')
if last_main_number:
new_key = f"{last_main_number}.{sub_number}"
if current_key is not None:
@@ -194,7 +213,13 @@ def parse_text_by_heading(text):
append_newline = True
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif pure_number_match: # 处理没有点号的纯数字开头的情况'27xxxxx'
elif dot_text_match:
# 处理以点号开头但不带数字的情况,存储到临时变量
temp_title = dot_text_match.group(1).strip()
continue # 跳过进一步处理该行
elif pure_number_match: # 处理不带点号的纯数字开头的情况,例如 '27xxxxx'
new_key_candidate, line_content = pure_number_match.groups()
new_key_candidate += '.' # 添加点号
line_content = line_content.lstrip('..、,')
@@ -208,6 +233,12 @@ def parse_text_by_heading(text):
# 判断新键名是否为新的标题,条件是键名递增且差值在5以内
if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
# 处理临时标题与新主标题的关联
if temp_title:
main_number = new_key_candidate.split('.')[0]
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 开始新的标题
if current_key is not None:
content_string = ''.join(current_content).strip()
@@ -216,19 +247,28 @@ def parse_text_by_heading(text):
current_content = [line_content]
append_newline = True
last_main_number = new_key_candidate.rstrip('.')
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else:
# 将当前行视为当前标题的内容
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
else:
if not skip_subheadings:
# 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑
pattern_title = re.compile(r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
if not skip_subheadings: # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
pattern_title = re.compile(
r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped)
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
# 优化了处理逻辑,减少冗余
# 优化处理逻辑,减少冗余
if chinese_match or letter_match or arabic_match:
# 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题)
if current_key is not None:
@@ -277,6 +317,9 @@ def parse_text_by_heading(text):
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
if temp_title:
# 处理任何未关联的临时标题
data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名
return data
@@ -364,13 +407,13 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
print(f"The specified file does not exist: {file_path}")
return ""
if type == 1:
start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)'
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result = parse_text_by_heading(text)
else:
start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result=parse_text_to_dict(text)
# result = convert_to_json(input_path, start_word, end_pattern)
@@ -407,9 +450,9 @@ def process_folder(input_folder, output_folder):
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
try:
output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}")
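Two of the new rules in parse_text_by_heading are easy to miss in the hunks above: lines that start with a year are appended to the previous heading's content, and a dot-prefixed line without a number (e.g. '.投标文件的制作和签署') is parked in temp_title until the next main number appears, then stored under 'N.'. The toy parser below reproduces only these two rules, not the full function:

import re

def mini_parse(lines):
    data, current_key, temp_title = {}, None, None
    for line in (l.strip() for l in lines):
        if re.match(r'^\d{4}\s*年', line.replace(' ', '')) and current_key:
            data[current_key] += line                        # 年份行并入上一标题内容
            continue
        dot_text = re.match(r'^[..]\s*(\D.+)$', line)
        if dot_text:
            temp_title = dot_text.group(1).strip()           # 暂存不带数字的点号标题
            continue
        m = re.match(r'^(\d+(?:\.\d+)*)\s*(.*)', line)
        if m:
            current_key = m.group(1)
            if temp_title:                                   # 挂到主编号 "N." 之下
                data[f"{current_key.split('.')[0]}."] = temp_title
                temp_title = None
            data[current_key] = m.group(2)
        elif current_key:
            data[current_key] += line
    return data

print(mini_parse([".投标文件的制作和签署", "27.1 密封要求", "2024年 开标"]))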
@@ -129,22 +129,26 @@ def fetch_invalid_requirements(invalid_docpath, output_folder):
logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒")
return find_invalid_res
def fetch_bidding_documents_requirements(invalid_path,clause_path):
def fetch_bidding_documents_requirements(invalid_path,merged_baseinfo_path,clause_path):
logger.info("starting 投标文件要求...")
if not merged_baseinfo_path:
merged_baseinfo_path=invalid_path
start_time = time.time()
selection=1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection)
fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
end_time = time.time()
logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒")
return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(invalid_path,clause_path):
def fetch_bid_opening(invalid_path,merged_baseinfo_path,clause_path):
logger.info("starting 开评定标流程...")
if not merged_baseinfo_path:
merged_baseinfo_path=invalid_path
start_time = time.time()
selection=2
fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection)
fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
end_time = time.time()
logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒")
return {"开评定标流程": fetch_bid_opening_json}
@@ -201,9 +205,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
processed_data['evaluation_method_path']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path']),
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,