11.15 工程标资格审查提示词重改

2024-11-15 14:43:10 +08:00 · 2024-11-15 14:43:10 +08:00 · dc91f1d9b5
commit dc91f1d9b5
parent 31731ea5b5
15 changed files with 35 additions and 28 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@ -176,7 +176,6 @@ def docx2pdf(local_path_in):
 #     return output_path
 #TODO:6.2定版视频会议磋商文件.doc文件转换有问题
 if __name__ == '__main__':
    # 替换为你的文件路径和API URL
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -185,6 +185,8 @@ def process_nested_data(data):
 #生成无结构的数据货物标
 def concatenate_keys_values(section_content):
    print("-------------")
    print(json.dumps(section_content, ensure_ascii=False, indent=4))
    """
    将章节内容的键值对拼接成一个字符串列表，每个元素为 "key value"。
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@ -574,7 +574,6 @@ def combine_find_invalid(file_path, output_dir):
    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}
 # TODO:无效标目前以整个docx文档作为输入，可能导致后面两章不必要的信息也导入。 无效投标至少>8个字
 if __name__ == '__main__':
    start_time = time.time()
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -3,8 +3,6 @@ from flask_app.general.clean_pdf import extract_common_header, clean_page_conten
 def extract_text_by_page(file_path):
    common_header = extract_common_header(file_path)
    print(f"公共抬头：{common_header}")
    print("--------------------正文开始-------------------")
    result = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
@ -14,7 +12,6 @@ def extract_text_by_page(file_path):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
                print("-------------------")
                cleaned_text = clean_page_content(text,common_header)
                print(cleaned_text)
                result += cleaned_text
@ -96,7 +93,7 @@ def extract_text_by_page(file_path):
 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path =r"C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf"
+    file_path=r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-11\17F7BF97-1A4D-427D-81F2-5C9AD4B097DB_DF07_Flatten\17F7BF97-1A4D-427D-81F2-5C9AD4B097DB_DF07_Flatten_1-524.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
--- a/flask_app/main/基础信息整合快速版.py
+++ b/flask_app/main/基础信息整合快速版.py
@ -4,7 +4,7 @@ import time
 import concurrent.futures
 from flask_app.general.json_utils import clean_json_string, rename_outer_key
 from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 from flask_app.main.判断是否分包等 import read_questions_from_judge, merge_json_to_list
 from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.general.通义千问long import upload_file,qianwen_long
--- a/flask_app/main/工程标解析main.py
+++ b/flask_app/main/工程标解析main.py
@ -12,7 +12,7 @@ from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.general.无效标和废标公共代码 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.main.基础信息整合快速版 import combine_basic_info
 from flask_app.main.资格审查模块 import combine_review_standards
@ -223,7 +223,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
 #TODO:废标项，针对新文件作优化，统一成货物标的处理逻辑
 #TODO:基本信息，判断是否这里，打勾逻辑取消了。
 #TODO:缩进
 if __name__ == "__main__":
--- a/flask_app/main/形式响应评审.py
+++ b/flask_app/main/形式响应评审.py
@ -317,6 +317,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus
            file_id = upload_file(output_path)
            results = multi_threading(formatted_questions, "", file_id, 2)
            first_response_list = [clean_json_string(res) for _, res in results] if results else []
            print(first_response_list)
    updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
    return updated_json
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -580,18 +580,14 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
        logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
        return [""] * len(selections)  # 返回与 selections 数量相同的空字符串列表
 # TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 # TODO:目前merged_baseinfo没有包含投标人须知正文。
 #TODO:zbtest20有问题
 # 投标人须知前附表改为货物标一样的
 if __name__ == "__main__":
    start_time = time.time()
    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4.pdf"
+    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
    files=truncate_pdf_multiple(input_path,output_folder)
    # selections = [4, 1]  # 仅处理 selection 4、1
--- a/flask_app/main/截取pdf_old.py
+++ b/flask_app/main/截取pdf_old.py
@ -515,7 +515,6 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
        return [""] * len(selections)  # 返回与 selections 数量相同的空字符串列表
 # TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 #TODO:zbtest8 zbtest18有问题  后期需要完善，截取需要截两次，第一次严格第二次宽松
 if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest8.pdf"
--- a/flask_app/main/投标人须知正文提取指定内容工程标.py
+++ b/flask_app/main/投标人须知正文提取指定内容工程标.py
@ -115,12 +115,14 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
            "Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")
    with open(clause_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
-        extracted_data = extract_between_sections(data, target_values)
+        extracted_data = extract_between_sections(data, target_values) #先使用大章节'二、投标文件'这种筛选
        if not extracted_data:
-            extracted_data = extract_json(data, target_values)  # 提取需要的数据
+            extracted_data = extract_json(data, target_values)  # 若没有，再使用'3.投标文件' 筛选
            if not extracted_data:
-                final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  # 万一没用正则匹配到，那就调用大模型
+                final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  # 万一都没，那就调用大模型
                return final_result
            print("老方法")
            print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
            final_result=extract_sections(extracted_data,target_values)  #后处理，生成键名
            return final_result
        else:
@ -138,10 +140,10 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
 if __name__ == "__main__":
    # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\merged_baseinfo_path_more.pdf"
+    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\merged_baseinfo_path_more.pdf"
-    clause_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\tmp\clause1.json"
+    clause_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\clause1.json"
    try:
-        res = extract_from_notice(merged_baseinfo_path,clause_path, 3)  # 可以改变此处的 type 参数测试不同的场景
+        res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
        res2 = json.dumps(res, ensure_ascii=False, indent=4)
        print(res2)
    except ValueError as e:
--- a/flask_app/main/解析old.py
+++ b/flask_app/main/解析old.py
@ -8,7 +8,7 @@ from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.main.基础信息整合快速版 import combine_basic_info
 from flask_app.main.资格审查模块 import combine_review_standards
--- a/flask_app/main/资格评审.py
+++ b/flask_app/main/资格评审.py
@ -134,7 +134,6 @@ def get_consortium_dict(merged_baseinfo_path):
        consortium_dict = clean_json_string(results1)
    return consortium_dict
 #TODO:修改问题
 def get_all_dict(invalid_path, ques=None):
    if ques is None:
        ques = []
--- a/flask_app/old_version/基础信息整合.py
+++ b/flask_app/old_version/基础信息整合.py
@ -1,7 +1,7 @@
 import json
 from flask_app.general.json_utils import clean_json_string, rename_outer_key
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
 from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.general.通义千问long import upload_file
--- a/flask_app/old_version/招标文件解析.py
+++ b/flask_app/old_version/招标文件解析.py
@ -9,7 +9,7 @@ from flask_app.old_version.文档理解大模型版知识库处理.知识库操
 from flask_app.main.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
-from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
+from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
 import concurrent.futures
 from flask_app.old_version.基础信息整合 import combine_basic_info
 from flask_app.old_version.资格审查模块old import combine_review_standards
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -167,7 +167,7 @@ def get_technical_requirements(file_id,invalid_path):
        file_id=upload_file(invalid_path)
        print("调用invalid_path")
    user_query1 = """
-请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
+请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统和货物，采购需求中可能包含层次关系，例如采购的某系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
 {
    "采购需求": {
        "交换机-1"：{},
@ -181,6 +181,20 @@ def get_technical_requirements(file_id,invalid_path):
    }
 }
    """
 #     user_query1 = """
 #     请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我该项目需要采购的系统或货物，要求回答全面不要遗漏，最细提取到具体的货物名称，而不是货物的功能需求。如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，注意不要返回'说明'或'规格'或'技术参数'列中的内容；若没有采购清单，你要从表格中或文中摘取需要采购的系统和货物。请以json格式返回结果，外层键名为采购的系统或货物名称，注意：采购需求中可能包含层次关系，例如采购的某系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，此时内层键名为该系统所需的货物名，需与原文保持一致，无需给出采购数量和单位。以下为需要考虑的特殊情况：如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同，请你以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增,例如若采购清单中有两种型号的'交换机'，那么你应返回两个键名，'交换机-1'和'交换机-2'；如果采购的某系统说明了该系统整体功能，那么在其内层键名中除了有该系统包含的货物，还应包含'系统功能'，具体键名同原文中的描述；如有未知内容，在对应键值处填'未知'。以下为考虑了特殊情况的示例输出：
 #     {
 #         "交换机-1":{},
 #         "交换机-2":{},
 #         "门禁管理系统": {},
 #         "交通监控视频子系统": {
 #             "系统功能":{},
 #             "高清视频抓拍像机":{},
 #             "补光灯":{}
 #         },
 #         "LED全彩显示屏": {}
 #     }
 #         """
    res = qianwen_long(file_id, user_query1)
    print(res)
    cleaned_res = clean_json_string(res)     #转字典
@ -271,8 +285,8 @@ def test_all_files_in_folder(input_folder, output_folder):
                print(f"处理文件 {file_path} 时出错: {e}")
 if __name__ == "__main__":
-    truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
+    # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
-    # # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
+    truncate_file=r"C:\Users\Administrator\Desktop\货物标\output1\招标文件(107国道)_procurement.docx"
    # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
    # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"