12.23 无效标废标更新

2024-12-23 15:47:41 +08:00 · 2024-12-23 15:47:41 +08:00 · f96bac58ba
commit f96bac58ba
parent f59c0428b4
17 changed files with 264 additions and 75 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@ -58,7 +58,7 @@ def download_file(url, local_filename):
                    # 转换扫描型 PDF -> DOCX -> 普通 PDF
                    intermediate_docx = pdf2docx(full_filename)
                    if intermediate_docx:
-                        normal_pdf = docx2pdf(intermediate_docx)
+                        normal_pdf = docx2pdf(intermediate_docx, force=True)
                        if normal_pdf:
                            # 替换原始 PDF 文件
                            os.replace(normal_pdf, full_filename)
@ -155,7 +155,18 @@ def doc2docx(local_path_in):
    downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
-def docx2pdf(local_path_in):
+def docx2pdf(local_path_in,force=False):
+    """
+        将 DOCX 文件转换为 PDF。
+
+        参数:
+        - local_path_in (str): 输入的 DOCX 文件路径。
+        - force (bool): 是否强制转换并覆盖已存在的 PDF 文件。默认为 False。
+
+        返回:
+        - str: 转换后的 PDF 文件路径
+        - "" 如果转换失败
+        """
    if not local_path_in:
        return ""
    # 获取文件名和所在文件夹
@ -163,8 +174,11 @@ def docx2pdf(local_path_in):
    # 检查是否已经存在同名的 .pdf 文件
    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
    if os.path.exists(pdf_file_path):
-        print(f"Skipping conversion, {pdf_file_path} already exists.")
-        return pdf_file_path
+        if force:
+            print(f"强制转换，覆盖已存在的文件: {pdf_file_path}")
+        else:
+            print(f"跳过转换，文件已存在: {pdf_file_path}")
+            return pdf_file_path  # 跳过转换
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = upload_file(local_path_in, remote_url)
    filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
@ -295,8 +309,8 @@ if __name__ == '__main__':
    # # downloaded_file=docx2pdf(local_path_in)
    # print(downloaded_file)

-    test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%20-%20%E5%89%AF%E6%9C%AC.PDF?Expires=1733478585&OSSAccessKeyId=TMP.3KhfwZc3kpT9TUmsb46yBDdnRq8bbENcEWBbZP8nLMgmSjVkjg9edpTPUQUsH8VXtvvg839Xbm8N5paYxPKvxCGqx3Vx4m&Signature=RYOo7tMEyahaMA3cSsf2kkf8co8%3D"
-    local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
+    test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/2022-%E5%B9%BF%E4%B8%9C-%E9%B9%8F%E5%8D%8E%E5%9F%BA%E9%87%91%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%E6%B7%B1%E5%9C%B3%E6%B7%B1%E4%B8%9A%E4%B8%8A%E5%9F%8E%E5%8A%9E%E5%85%AC%E5%AE%A4%E8%A3%85%E4%BF%AE%E9%A1%B9%E7%9B%AE.pdf?Expires=1734952142&OSSAccessKeyId=TMP.3KiE75LGW8c68AXJaPpYRFnZXWrLq6zszWkdUCghFWLphdM9YvAMwoCNofeTSYLTBAU3TebtNuwubFH7s3qgTFhCs7q98b&Signature=NnSiQaqhznJ33Q6DhfsxATUa1ls%3D"
+    local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        print("下载文件失败或不支持的文件类型")
--- a/flask_app/general/test_doubao.py
+++ b/flask_app/general/test_doubao.py
@ -0,0 +1,178 @@
+import os
+import time
+
+import requests
+from ratelimit import sleep_and_retry, limits
+def read_txt_to_string(file_path):
+    """
+    读取txt文件内容并返回一个包含所有内容的字符串，保持原有格式。
+
+    参数:
+    - file_path (str): txt文件的路径
+
+    返回:
+    - str: 包含文件内容的字符串；若文件不存在或读取出错，则返回以“错误：”开头的提示字符串
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as file:  # 确保使用适当的编码
+            content = file.read()  # 使用 read() 保持文件格式
+        return content
+    except FileNotFoundError:
+        return "错误：文件未找到。"
+    except Exception as e:
+        return f"错误：读取文件时发生错误。详细信息：{e}"
+def generate_full_user_query(file_path, prompt_template):
+    """
+    根据文件路径和提示词模板生成完整的user_query。
+
+    参数：
+    - file_path (str): 需要解析的文件路径。
+    - prompt_template (str): 包含{full_text}占位符的提示词模板。
+
+    返回：
+    - str: 完整的user_query。
+    """
+    # 调用 read_txt_to_string 读取 txt 文件的完整文本内容
+    full_text=read_txt_to_string(file_path)
+    # 格式化提示词，将提取的文件内容插入到模板中
+    user_query = prompt_template.format(full_text=full_text)
+
+    return user_query
+def get_total_tokens(text):
+    """
+    调用 API 计算给定文本的总 Token 数量。
+
+    参数：
+    - text (str): 需要计算 Token 的文本。
+    （注：模型名称在函数内部固定为 "ep-20241119121710-425g6"，不是可传入的参数。）
+
+    返回：
+    - int: 文本的 total_tokens 数量；请求或解析失败时返回 0。
+    """
+    # API 请求 URL
+    url = "https://ark.cn-beijing.volces.com/api/v3/tokenization"
+
+    # 获取 API 密钥
+    doubao_api_key = os.getenv("DOUBAO_API_KEY")
+    if not doubao_api_key:
+        raise ValueError("DOUBAO_API_KEY 环境变量未设置")
+
+    # 请求头
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": "Bearer " + doubao_api_key
+    }
+    model = "ep-20241119121710-425g6"
+    # 请求体
+    payload = {
+        "model": model,
+        "text": [text]  # API 文档中要求 text 是一个列表
+    }
+
+    try:
+        response = requests.post(url, headers=headers, json=payload)
+        response.raise_for_status()
+        response_data = response.json()
+        total_tokens=response_data["data"][0]["total_tokens"]
+        return total_tokens
+    except Exception as e:
+        print(f"获取 Token 数量失败：{e}")
+        return 0
+
+@sleep_and_retry
+@limits(calls=10, period=1)  # 每秒最多调用10次
+def doubao_model(full_user_query, need_extra=False):
+    print("call doubao...")
+    # 相关参数
+    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
+    doubao_api_key = os.getenv("DOUBAO_API_KEY")
+
+    # 定义主模型和备用模型
+    models = {
+        "pro_32k": "ep-20241119121710-425g6",  # 豆包Pro 32k模型
+        "pro_128k": "ep-20241119121743-xt6wg"  # 128k模型
+    }
+
+    # 判断用户查询字符串的长度
+    token_count = get_total_tokens(full_user_query)
+    if token_count > 31500:
+        selected_model = models["pro_128k"]  # 如果 token 数超过 31500（接近 32k 上限），直接使用128k模型
+    else:
+        selected_model = models["pro_32k"]  # 默认使用32k模型
+
+    # 请求头
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": "Bearer " + doubao_api_key
+    }
+
+    max_retries_429 = 2  # 针对 429 错误的最大重试次数
+    max_retries_other = 1  # 针对其他错误的最大重试次数
+    attempt = 0
+    response = None  # 确保 response 被定义
+
+    while True:
+        # 请求数据
+        data = {
+            "model": selected_model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": full_user_query
+                }
+            ],
+            "temperature": 0.2
+        }
+        try:
+            response = requests.post(url, headers=headers, json=data)  # 注意：此处未传入 timeout 参数，请求不会自动超时
+            response.raise_for_status()  # 如果响应状态码为 4xx/5xx，将引发HTTPError
+
+            # 获取响应 JSON
+            response_json = response.json()
+
+            # 获取返回内容
+            content = response_json["choices"][0]["message"]["content"]
+
+            # 获取 completion_tokens
+            completion_tokens = response_json["usage"].get("completion_tokens", 0)
+
+            # 根据 need_extra 返回不同的结果
+            if need_extra:
+                return content, completion_tokens
+            else:
+                return content
+
+        except requests.exceptions.RequestException as e:
+            # 获取状态码并处理不同的重试逻辑
+            status_code = response.status_code if response is not None else None
+            print(f"请求失败，状态码: {status_code}")
+            print("请求失败，完整的响应内容如下：")
+            if response is not None:
+                print(response.text)  # 打印原始的响应内容，可能是 JSON 格式，也可能是其他格式
+
+            # 如果是 429 错误
+            if status_code == 429:
+                if attempt < max_retries_429:
+                    wait_time = 2 if attempt == 0 else 4
+                    print(f"状态码为 429，等待 {wait_time} 秒后重试...")
+                    time.sleep(wait_time)
+                else:
+                    print(f"状态码为 429，已达到最大重试次数 {max_retries_429} 次。")
+                    break  # 超过最大重试次数，退出循环
+            else:
+                # 针对其他错误
+                if attempt < max_retries_other:
+                    print("非 429 错误，等待 1 秒后重试...")
+                    time.sleep(1)
+                else:
+                    print(f"非 429 错误，已达到最大重试次数 {max_retries_other} 次。")
+                    break  # 超过最大重试次数，退出循环
+
+            attempt += 1  # 增加重试计数
+
+    # 如果到这里，说明所有尝试都失败了
+    print(f"请求失败，已达到最大重试次数。")
+    if need_extra:
+        return None, 0
+    else:
+        return None
--- a/flask_app/general/截取pdf_main.py
+++ b/flask_app/general/截取pdf_main.py
@ -28,8 +28,8 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
    def handle_exception(selection):
        return ["", ""] if selection == 4 else [""]
    modes_config = {
-        "goods": {"selections": [1, 2, 3, 4, 5, 6], "truncate_func": truncate_pdf_main_goods},
-        "engineering": {"selections": [1, 2, 3, 4, 5], "truncate_func": truncate_pdf_main_engineering},
+        "goods": {"selections": [1, 2, 3, 4, 5], "invalid_selection": 6, "truncate_func": truncate_pdf_main_goods},
+        "engineering": {"selections": [1, 2, 3, 4],"invalid_selection": 5,  "truncate_func": truncate_pdf_main_engineering},
    }

    # 验证 mode 是否有效
@ -40,9 +40,10 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
    config = modes_config[mode]
    truncate_function = config["truncate_func"]
    selections = selections or config["selections"]
-
+    invalid_selection=config["invalid_selection"]
+    invalid_path=truncate_function(pdf_path,output_folder,invalid_selection,logger)[0]
    # 检查 PDF 页数逻辑
-    skip, empty_return = check_pdf_pages(pdf_path, mode, logger)
+    skip, empty_return = check_pdf_pages(invalid_path, mode, logger)
    if skip:
        return empty_return

@ -92,6 +93,9 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
        base_file_name,
        mode=mode
    )
+    if invalid_path:
+        truncate_files.append(invalid_path)
+        logger.info(f"已添加 invalid_path: {invalid_path}")

    if merged_path:
        # 合并成功，添加合并后的文件路径
@ -110,17 +114,18 @@ if __name__ == "__main__":
    start_time = time.time()
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
-    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
+    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
-    # pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件（实高电子显示屏）.pdf'
+    pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件（实高电子显示屏）.pdf'
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
    # selections = [1, 4]  # 仅处理 selection 4、1
-    selections=[6]
-    #engineering
-    # files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
-    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
+    selections = [1, 3, 5]
+    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
+    # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
    print(files)
+    # print(files[-1])
+    # print(files[-2])
    # selection = 1  # 例如：1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
    # print(generated_files)
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@ -40,21 +40,21 @@ def check_pdf_pages(pdf_path,mode, logger):
        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
-        if num_pages <= 50:
+        if num_pages <= 30:
            logger.info("PDF页数小于或等于50页，跳过切分逻辑。")
            if mode=='goods':
-                return True,['', '', '', '', '', '', '','']
+                return True,['', '', '', '', '', '', pdf_path,'']
            else:
-                return True,['', '', '', '', '', '', '']
+                return True,['', '', '', '', '', pdf_path, '']
        # 若页数大于50页，返回None表示继续处理
        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        # 返回空列表意味着无法执行后续处理逻辑
        if mode == 'goods':
-            return True,['', '', '', '', '', '', '', '']
+            return True,['', '', '', '', '', '', pdf_path, '']
        else:
-            return True,['', '', '', '', '', '', '']
+            return True,['', '', '', '', '', pdf_path, '']


 def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header):
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -365,9 +365,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
            -使用嵌套键值对。
                -嵌套键名应为原文中的具体标题或对相关子要求的简明总结。
                -最内层的键值应与原文内容保持一致，不得进行任何总结、删减或改写。默认键值是单独的字符串，如果一个子要求包含多个并列内容，键值应为一个字符串列表（数组），其中每个元素都是子要求内容。
+        - 特别限制：
+            - 若文件中有类似“投标文件格式要求”的小节，禁止输出原文中的表格格式示例，请仅提取并描述具体的文字部分的格式要求，而不是重现表格内容；若无类似小节，请忽略这点，也无需返回该键值对。
    表格内容处理：
        - 如果原文中对应内容以表格形式呈现，请使用 Markdown 语法准确重现该表格。
-        - 表格的每一行应作为键值中的一个独立字符串，保持表格结构和内容的完整性。
+        - 表格的每一行应作为键值（字符串列表）中的一个独立字符串，保持表格结构和内容的完整性。
    禁止内容：
        - 确保所有输出内容均基于提供的实际招标文件内容，不使用任何预设的示例作为回答。
        - 预设的示例中的外层键名仅供格式参考，以文中实际内容为主。
@ -476,7 +478,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        return {"error": "调用大模型失败"}

 if __name__ == "__main__":
-    merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\ce279982-aeeb-4f08-ab39-df6ee2732eae\ztbfile.pdf"
+    merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\eabefc28-142f-4bb5-b1be-e86e43bb87b5\invalid_del.docx"
    selection=1
    res=get_requirements_with_gpt(merged_baseinfo_path,selection)
    print(json.dumps(res,ensure_ascii=False,indent=4))
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@ -659,7 +659,7 @@ if __name__ == '__main__':
    output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp"
    # invalid_added=insert_mark(pdf_path)
    # invalid_added_docx=pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\invalid_added.docx'
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\953e3722-f49e-4f2f-b513-513b75894701\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):

 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件.pdf"
+    file_path=r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\ztbfile.pdf"
    # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
--- a/flask_app/general/通用功能函数.py
+++ b/flask_app/general/通用功能函数.py
@ -15,14 +15,13 @@ def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_l
            "外层键名为'联合体投标要求'，其中有一个嵌套键值对为：\"是否接受联合体投标\":\"是\""
        )
        judge_questions.append(judge_consortium_question)
+    if not judge_questions:
+        print("process_judge_questions:没有需要处理的 judge_questions，跳过 multi_threading 调用。")
+        return  # 或者根据需要返回其他值
    # file_id3 = upload_file(merged_baseinfo_path)
    res2 = multi_threading(judge_questions, "", file_id, 2)
-
-    if not res2:
-        print("基础信息整合： multi_threading error!")
-    else:
-        for question, response in res2:
-            baseinfo_list1.append(clean_json_string(response))
+    for question, response in res2:
+        baseinfo_list1.append(clean_json_string(response))

 def judge_consortium_bidding(baseinfo_list):
    updated_list = []
--- a/flask_app/routes/偏离表main.py
+++ b/flask_app/routes/偏离表main.py
@ -360,8 +360,9 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder)
    notice_path=files[0]
    qualification_file=files[1]
    procurement_file=files[2]
+    invalid_path=files[-2]
    # invalid_path=docx_path
-    invalid_path=docx_path if docx_path != "" else pdf_path  #可能是pdf docx
+    invalid_path=docx_path if docx_path != "" else invalid_path  #可能是pdf docx
    if not procurement_file:
        procurement_file=invalid_path

--- a/flask_app/routes/小解析main.py
+++ b/flask_app/routes/小解析main.py
@ -11,9 +11,10 @@ from flask_app.general.通义千问long import upload_file
 from flask_app.general.通用功能函数 import get_global_logger
 from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
 from flask_app.general.截取pdf_main import truncate_pdf_multiple
+from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
+from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
 from flask_app.general.post_processing import inner_post_processing
 from flask_app.工程标.基础信息整合工程标 import aggregate_basic_info_engineering
-
 #货物标
 def little_parse_goods(output_folder, pdf_path,logger):
    """
@ -27,14 +28,14 @@ def little_parse_goods(output_folder, pdf_path,logger):
        dict: 包含 '基础信息' 的字典。
    """
    # 截取特定的货物 PDF 文件
-    selections = [1,4]  # 仅处理 selection 1和4  #公告+投标人须知
+    selections = [1,4,6]  # 处理 selection 1、4、6  #公告+投标人须知+无效标(goods 的 invalid_selection)
    files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods',selections)
    if not files:
        raise ValueError("未找到截取后的文件。")
    # 假设最后一个文件是需要处理的基础信息文件
    baseinfo_file_path = files[-1]
    if not baseinfo_file_path:
-        baseinfo_file_path=pdf_path     #截取失败就传整份文件
+        baseinfo_file_path=files[-2]
    # 上传文件并获取文件 ID
    file_id = upload_file(baseinfo_file_path)
    # 注意：以下路径被硬编码，确保该路径存在并且正确
@ -70,8 +71,7 @@ def little_parse_engineering(output_folder, pdf_path,logger):
    # 假设最后一个文件是需要处理的基础信息文件
    baseinfo_file_path = files[-1]
    if not baseinfo_file_path:
-        selections=[5]
-        baseinfo_file_path=truncate_pdf_multiple(pdf_path,output_folder,logger,selections)[0]  #invalid_path
+        baseinfo_file_path=files[-2]
    # 上传文件并获取文件 ID
    file_id = upload_file(baseinfo_file_path)
    # 注意：以下路径被硬编码，确保该路径存在并且正确
--- a/flask_app/test_case/test_正则表达式2.py
+++ b/flask_app/test_case/test_正则表达式2.py
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@ -198,22 +198,25 @@ def generate_template(required_keys,full_text, type=1):
    def generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type):
        if type == 1:
            specific_instructions = textwrap.dedent(
-                """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外，还有带三角▲或五角星★的描述内容（如工期要求、进度要求、品牌要求等商务要求），也请将该部分内容提取出来，添加在外层键名为'商务要求'的键值部分；若存在标题包含'工期要求'、'进度要求'等和商务要求有关的关键字，也请将该标题下的内容提取，添加在外层键名为'商务要求'的键值部分。请不要遗漏这部分的'商务要求'。
-    5. 在提取'服务要求'的时候，若原文（包含正文和表格）中存在'安装要求'、'售后要求'、'维护要求'、'培训要求'等服务相关的要求说明，请添加至'服务要求'的键值部分，不要遗漏这部分的'服务要求'。
+                """5. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外，还有带三角▲或五角星★的描述内容（如工期要求、进度要求、品牌要求等商务要求），也请将该部分内容提取出来，添加在外层键名为'商务要求'的键值部分；若存在标题包含'工期要求'、'进度要求'等和商务要求有关的关键字，可以使用嵌套键值对形式，将这些内容添加到“商务要求”的键值部分，请不要遗漏这部分的'商务要求'。
+    6. 在提取'服务要求'的时候，若原文（包含正文和表格）中存在'安装要求'、'售后要求'、'维护要求'、'培训要求'等服务相关的要求说明，可以使用嵌套键值对形式，添加至'服务要求'的键值部分，不要遗漏这部分的'服务要求'。
+    **限制内容**：
+        - **避免提取技术要求**：在提取这些要求时，确保不包含任何与技术规格、功能或性能相关的内容。
                """
            )
        else:
            specific_instructions = textwrap.dedent(
-                """4. 在提取技术要求或技术、服务要求时，你无需从采购清单或表格中提取技术要求以及参数要求，你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容，从正文中提取；若技术要求或技术服务要求的内容全在表格中，键值为空列表[]。
-    5. 在提取'技术要求'时，注意不要提取有关'安装、售后、维护、运维、培训、质保'等要求，它们不属于'技术要求'，但是若大标题中包含'总体要求''建设要求'等和技术要求相关的关键字，请添加到'技术要求'的键值部分。
+                """5. 在提取技术要求或技术、服务要求时，你无需从采购清单或表格中提取具体设备、采购标的的技术要求以及参数要求，你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题，并提取该标题下的整体技术要求内容；若该章节的技术要求内容全部以表格形式呈现，则键值应为空列表[]。
+    6. 在提取'技术要求'时，注意不要提取有关'安装、售后、维护、运维、培训、质保、工期、进度'等要求，它们不属于'技术要求'，但是若大标题中包含'总体要求''建设要求'等和技术要求相关的关键字，可以使用嵌套键值对形式，将这些内容添加到“技术要求”的键值部分。
                """
            )
        return textwrap.dedent(
-            f"""请你根据该货物类招标文件中的采购要求部分内容，请告诉我该项目采购的{keys_str}分别是什么，请以json格式返回结果，默认情况下外层键名是{outer_keys_str}，键值为字符串列表，每个字符串表示具体的一条要求，可以按原文中的序号作划分（若有序号的话），请按原文内容回答，保留三角▲、五角星★和序号（若有），不要擅自增删内容。
+            f"""请你根据该货物类招标文件中的采购要求部分内容，请告诉我该项目采购的{keys_str}分别是什么，请以json格式返回结果，外层键名是{outer_keys_str}，默认情况下键值为字符串列表，每个字符串表示具体的一条要求，可以按原文中的序号作划分（若有序号的话），请按原文内容回答，保留三角▲、五角星★和序号（若有），不要擅自增删内容。
    
    注意事项：
-    1. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
-    2. 请不要提取{another_keys_str}中的内容。
+    1. 提取的要求应为采购、招标活动或项目的整体要求，而非针对具体采购物品的技术参数或功能要求。
+    2. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
+    3. 请不要提取{another_keys_str}中的内容。

    要求与指南：
    1. JSON 的结构要求：
@ -225,8 +228,8 @@ def generate_template(required_keys,full_text, type=1):
        - 最多只允许一层嵌套。
    2. 请优先且准确定位正文部分包含以下关键字的标题：{outer_keys_str}，在其之后提取'XX要求'相关内容，尽量避免在无关地方提取内容。
    3. 注意请不要返回Markdown表格语法，必要时使用冒号':'将相关信息拼接在一起 
+    4. 字符串列表中的每个字符串内容需与原文内容保持一致，保留前面的三角▲、五角星★和序号（如果有），但不可以擅自添加这些内容。
    {specific_instructions}
-    6. 字符串列表中的每个字符串内容需与原文内容保持一致，保留前面的三角▲、五角星★和序号（如果有），但不可以擅自添加这些内容。
    """)

    # 过滤示例内容
--- a/flask_app/货物标/基础信息解析main.py
+++ b/flask_app/货物标/基础信息解析main.py
@ -134,8 +134,8 @@ def dynamic_key_handling(key_groups, detected_keys):

 def get_base_info(merged_baseinfo_path,clause_path):
    file_id = upload_file(merged_baseinfo_path)
-    baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
-    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
+    # baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
+    baseinfo_file_path = r'D:\flask_project\flask_app\static\提示词\基本信息货物标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    more_query = "请你根据招标文件信息，回答以下问题：是否组织踏勘现场？是否召开投标预备会（或投标答疑会）？是否退还投标文件？是否允许分包? 是否需要递交投标保证金（或磋商保证金）？是否需要提交履约保证金（或履约担保）？是否有招标代理服务费（或中标、成交服务费或采购代理服务费）？请按json格式给我提供信息，键名分别为'是否组织踏勘现场','是否召开投标预备会'（或'是否召开投标答疑会'）,'是否退还投标文件',是否允许分包','是否递交投标保证金'（或'是否递交磋商保证金'）,'是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息，请回答'未知'。"
    questions.append(more_query)
@ -143,8 +143,8 @@ def get_base_info(merged_baseinfo_path,clause_path):
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
    chosen_numbers, merged = merge_json_to_list(baseinfo_list.pop())
    baseinfo_list.append(merged)
-    judge_file_path = 'flask_app/static/提示词/是否相关问题货物标.txt'
-    # judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
+    # judge_file_path = 'flask_app/static/提示词/是否相关问题货物标.txt'
+    judge_file_path =r'D:\flask_project\flask_app\static\提示词\是否相关问题货物标.txt'
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # 提交两个任务
        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, file_id,
@ -156,24 +156,6 @@ def get_base_info(merged_baseinfo_path,clause_path):
        rebidding_situation = future2.result()
    update_json = add_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)
-
-    # # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
-    # judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
-    # # print(judge_questions)
-    # judge_consortium = judge_consortium_bidding(baseinfo_list)  # 通过招标公告判断是否接受联合体投标
-    #
-    # if judge_consortium:
-    #     judge_consortium_question = (
-    #         "该招标文件对于联合体投标的要求是怎样的，请按json格式给我提供信息，"
-    #         "外层键名为'联合体投标要求'，其中有一个嵌套键值对为：\"是否接受联合体投标\":\"是\""
-    #     )
-    #     judge_questions.append(judge_consortium_question)
-    # res2 = multi_threading(judge_questions, "", file_id, 2)  # 调用千问-long
-    # if not res2:
-    #     print("基础信息整合： multi_threading error!")
-    # else:
-    #     for question, response in res2:
-    #         baseinfo_list.append(clean_json_string(response))
    return baseinfo_list

 def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invalid_path):
@ -209,11 +191,12 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali
 if __name__ == "__main__":
    start_time=time.time()
    # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx"
+    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\eabefc28-142f-4bb5-b1be-e86e43bb87b5\invalid_del.docx"
    # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
    procurement_file_path = r"D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx"
    clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
-    res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
+    # res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
+    res=get_base_info(merged_baseinfo_path,"")
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time=time.time()
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -611,9 +611,10 @@ if __name__ == "__main__":
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
    # pdf_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
+    pdf_path=r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\ztbfile.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\货物标\output33"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 3  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
+    selection = 6  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
+    print(type(generated_files))
    print(generated_files)
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -391,6 +391,9 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
    key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data)  # 提取需要采购的货物清单 key_list：交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2'，提取'交换机'  ，输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
    modified_data=rename_keys(data_copy)
    user_query_template = """请根据货物标中采购要求部分的内容，告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果，键名为\"{}\"，键值为一个列表，列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串，请按原文内容回答，保留三角▲、五角★和序号，不可擅自增删内容，尤其是不可擅自添加序号。
+**重要限制**：
+- **仅提取技术参数或采购要求，不包括任何商务要求**。商务要求通常涉及供应商资格、报价条款、交货时间、质保等内容，是整体的要求；而技术参数或采购要求则具体描述产品的技术规格、功能、性能指标等。
+- **商务要求的关键词示例**（仅供参考，不限于此）：报价、交货、合同、资质、认证、服务、保修期等。如果内容包含上述关键词，请仔细甄别是否属于商务要求。

 要求与指南：
 1. 你的键值应该全面，不要遗漏。
--- a/flask_app/货物标/提取json货物标版.py
+++ b/flask_app/货物标/提取json货物标版.py
@ -131,7 +131,7 @@ def clean_content(content):
 #TODO:这里的start_word end_pattern可以优化
 def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
    if not os.path.exists(file_path):
-        print(f"The specified file does not exist: {file_path}")
+        print(f"The specified file does not exist: 返回空的clause_path")
        return ""
    if type == 1:
        start_word = r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
--- a/flask_app/货物标/资格审查main.py
+++ b/flask_app/货物标/资格审查main.py
@ -680,14 +680,14 @@ if __name__ == "__main__":
    output_folder=r"D:\flask_project\flask_app\static\output\output1\c911b0f8-0ff4-4718-80e3-86f464f313d3"
    # qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf"
    # qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf"
-    qualification_path=r"C:\Users\Administrator\Desktop\货物标\output3\招标文件（定稿）_qualification1.pdf"
+    qualification_path=r""
    # notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf"
    # notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf"
-    notice_path=r"C:\Users\Administrator\Desktop\new招标文件\货物标\tmp1\HBDL-2024-0362-001-招标文件_notice.pdf"
+    notice_path=r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\ztbfile_notice.pdf"
    # knowledge_name = "6.2视频会议docx"
    # invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
    # invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf"
-    invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b\ztbfile_invalid.pdf"
+    invalid_path=r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\invalid_del.docx"
    res = combine_qualification_review(invalid_path, qualification_path, notice_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time=time.time()