11.6修复bug

2024-11-06 12:20:24 +08:00 · 2024-11-06 12:20:24 +08:00 · 3bd548ea81
commit 3bd548ea81
parent d4d1a14c06
8 changed files with 113 additions and 66 deletions
--- a/flask_app/general/纯技术参数要求提取.py
+++ b/flask_app/general/纯技术参数要求提取.py
@ -1,8 +1,19 @@
-from flask_app.general.format_change import pdf2docx
+import json
+import logging
+
+from flask_app.general.format_change import pdf2docx, docx2pdf
 from flask_app.general.通义千问long import upload_file
 from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
 from flask_app.货物标.技术参数要求提取 import get_technical_requirements

+def get_global_logger(unique_id):  # Return the logger named unique_id, or the default logger when unique_id is None.
+    if unique_id is None:
+        return logging.getLogger()  # no id supplied: fall back to the default (root) logger
+    logger = logging.getLogger(unique_id)  # look the logger up by name; unique_id is used as the logger name
+    return logger
+
+
+logger = None  # module-level placeholder; get_technical_requirements_main rebinds it through `global logger` on each call

 def extract_matching_keys(data_dict, good_list):
    """
@ -31,23 +42,38 @@ def extract_matching_keys(data_dict, good_list):
    recurse(data_dict)
    return result

-def get_technical_requirements_main(file_path,output_folder):
-    truncate_file=truncate_pdf_main(file_path,output_folder,5)[0]
+def get_technical_requirements_main(file_path,file_type,unique_id,output_folder):  # Convert the input to PDF if needed, truncate it, upload the docx version and return the extracted technical requirements (None for an unsupported file_type).
+    global logger  # NOTE(review): rebinds the module-level logger on every call; concurrent calls with different unique_id values would overwrite each other's logger — confirm this is acceptable
+    logger = get_global_logger(unique_id)
+    if file_type == 1:  # docx
+        docx_path = file_path
+        pdf_path = docx2pdf(docx_path)  # convert the docx to pdf for the later processing steps
+    elif file_type == 2:  # pdf
+        pdf_path = file_path
+    elif file_type == 3:  # doc
+        pdf_path = docx2pdf(file_path)  # NOTE(review): a .doc path is handed to docx2pdf unchanged — confirm docx2pdf accepts .doc input
+    else:
+        logger.error("Unsupported file type provided. Preprocessing halted.")
+        return None  # unsupported file_type: nothing is processed
+    truncate_file=truncate_pdf_main(pdf_path,output_folder,5)[0]  # first item of the selection-5 result; NOTE(review): raises IndexError if the result is empty — confirm truncate_pdf_main's contract
    if not truncate_file:
-        truncate_file=file_path    # fall back to passing the whole file
+        truncate_file=pdf_path    # fall back to passing the whole file
    truncate_file_docx=pdf2docx(truncate_file)
    file_id=upload_file(truncate_file_docx)
    # file_id=upload_file(truncate_file)
-    final_res=get_technical_requirements(file_id)
+    final_res=get_technical_requirements(file_id,pdf_path)  # pdf_path is supplied as the invalid_path argument
    # Safely extract the dict stored under the "技术要求" key
    if isinstance(final_res, dict) and '技术要求' in final_res and isinstance(final_res['技术要求'], dict):
        technical_requirements = final_res['技术要求']
        good_list = technical_requirements.pop('货物列表', [])  # yields [] when '货物列表' is absent
+        print(good_list)  # NOTE(review): debug print; the same value is logged on the next line
+        logger.info("Collected good_list from the processing function: %s", good_list)
        return extract_matching_keys(technical_requirements,good_list)
    else:
        return final_res  # no dict under '技术要求': the raw result is returned unchanged
 if __name__ == "__main__":  # manual run against a local sample file
-    file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile.pdf"
+    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
+    file_type=2  # 2 selects the pdf branch of get_technical_requirements_main
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
-    res=get_technical_requirements_main(file_path,output_folder)
-    print(res)
+    res=get_technical_requirements_main(file_path,file_type,"123",output_folder)  # "123" is passed as unique_id and becomes the logger name
+    print(json.dumps(res,ensure_ascii=False,indent=4))  # ensure_ascii=False prints non-ASCII text unescaped
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@ -12,8 +12,10 @@ from flask_app.general.post_processing import outer_post_processing
 from flask_app.main.工程标解析main import engineering_bid_main
 from flask_app.货物标.货物标解析main import goods_bid_main
 from flask_app.general.纯技术参数要求提取 import get_technical_requirements_main
+
 app = Flask(__name__)

+
 class CSTFormatter(logging.Formatter):
    """自定义的 Formatter，将日志的时间戳调整为中国标准时间（UTC+8）"""

@ -156,11 +158,13 @@ def validate_request(default_zb_type=1):
        return jsonify({'error': 'Invalid zb_type provided'}), 400
    return file_url, zb_type

-#提取采购需求
+
+# 提取采购需求
@app.route('/procurement_reqs', methods=['POST'])
 def get_procurement_reqs():
    logger = g.logger
-    output_folder=g.output_folder
+    output_folder = g.output_folder
+    unique_id=g.unique_id
    file_url, zb_type = validate_request()
    if isinstance(file_url, tuple):  # Check if the returned value is an error response
        return file_url
@ -173,8 +177,8 @@ def get_procurement_reqs():
                'message': 'This endpoint only supports zb_type 2 (procurement requirements)'
            }), 400
        else:
-            final_res_path=os.path.join(output_folder,'final_result.json')
-            response = download_and_process_file_for_procurement(file_url)
+            final_res_path = os.path.join(output_folder, 'final_result.json')
+            response = download_and_process_file_for_procurement(file_url,unique_id)
            try:
                with open(final_res_path, 'w', encoding='utf-8') as json_file:
                    json.dump(response, json_file, ensure_ascii=False, indent=4)
@ -190,8 +194,9 @@ def get_procurement_reqs():
        logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
        return jsonify({'error': str(e)}), 500

-#提取采购需求
-def download_and_process_file_for_procurement(file_url):
+
+# 提取采购需求
+def download_and_process_file_for_procurement(file_url,unique_id):
    """
    下载并处理采购需求文件。

@ -211,18 +216,19 @@ def download_and_process_file_for_procurement(file_url):
        logger.error("Unsupported file type or failed to download file")
        return None
    logger.info("Local file path: " + downloaded_filepath)
-    res = get_technical_requirements_main(downloaded_filepath, output_folder)
+    res = get_technical_requirements_main(downloaded_filepath, file_type, unique_id,output_folder)
    return res

-@app.route('/little_zbparse',methods=['POST'])
+
+@app.route('/little_zbparse', methods=['POST'])
 def little_zbparse():
-    logger=g.logger
-    file_url,zb_type = validate_request()
+    logger = g.logger
+    file_url, zb_type = validate_request()
    if isinstance(file_url, tuple):  # Check if the returned value is an error response
        return file_url
    try:
        logger.info("starting parsing url:" + file_url)
-        final_json_path= download_and_process_file(file_url,zb_type)
+        final_json_path = download_and_process_file(file_url, zb_type)
        if not final_json_path:
            return jsonify({'error': 'File processing failed'}), 500
        response = generate_response(final_json_path)  # 先获取响应内容
@ -232,6 +238,7 @@ def little_zbparse():
        logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
        return jsonify({'error': str(e)}), 500

+
 def download_and_process_file(file_url, zb_type):
    """
    下载并处理文件，根据zb_type选择处理函数。
@ -259,6 +266,7 @@ def download_and_process_file(file_url, zb_type):
    processed_file_path = little_parse_main(output_folder, downloaded_filepath, file_type, zb_type, g.unique_id)
    return processed_file_path

+
 def generate_response(final_json_path):
    logger = g.logger
    # 检查final_json_path是否为空或None
@ -287,16 +295,17 @@ def zbparse():
    # 获取并显示接收到的 JSON 数据
    received_data = request.get_json()
    logger.info("Received JSON data: " + str(received_data))
-    file_url,zb_type = validate_request()
+    file_url, zb_type = validate_request()
    if isinstance(file_url, tuple):  # Check if the returned value is an error response
        return file_url
    try:
        logger.info("starting parsing url:" + file_url)
-        return Response(stream_with_context(process_and_stream(file_url,zb_type)), content_type='text/event-stream')
+        return Response(stream_with_context(process_and_stream(file_url, zb_type)), content_type='text/event-stream')
    except Exception as e:
        logger.error('Exception occurred: ' + str(e))
        return jsonify({'error': str(e)}), 500

+
 # 分段返回
 def process_and_stream(file_url, zb_type):
    """
@ -352,7 +361,7 @@ def process_and_stream(file_url, zb_type):
            1: engineering_bid_main,
            2: goods_bid_main
        }
-        processing_func = processing_functions.get(zb_type, engineering_bid_main)  #默认按工程标解析
+        processing_func = processing_functions.get(zb_type, engineering_bid_main)  # 默认按工程标解析

        # 从 processing_func 获取数据
        for data in processing_func(output_folder, downloaded_filepath, file_type, unique_id):
@ -386,17 +395,18 @@ def process_and_stream(file_url, zb_type):
            }
            yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n"

-        base_end_time=time.time()
+        base_end_time = time.time()
        logger.info(f"分段解析完成，耗时：{base_end_time - start_time:.2f} 秒")
        # **保存 combined_data 到 output_folder 下的 'final_result.json'**
        output_json_path = os.path.join(output_folder, 'final_result.json')
-        extracted_info_path=os.path.join(output_folder, 'extracted_result.json')
+        extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
        includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
-        final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes,good_list)
+        final_result, extracted_info, procurement_reqs = outer_post_processing(combined_data, includes, good_list)

-        logger.info(f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}")  # 添加日志记录
-        #采购需求
-        procurement_reqs_response={
+        logger.info(
+            f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}")  # 添加日志记录
+        # 采购需求
+        procurement_reqs_response = {
            'message': 'procurement_reqs',
            'filename': os.path.basename(downloaded_filepath),
            'data': json.dumps(procurement_reqs, ensure_ascii=False)
@ -417,7 +427,7 @@ def process_and_stream(file_url, zb_type):
        except IOError as e:
            logger.error(f"保存JSON文件时出错: {e}")

-        #提取的数据
+        # 提取的数据
        extracted_info_response = {
            'message': 'extracted_info',
            'filename': os.path.basename(downloaded_filepath),
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -347,7 +347,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
                        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()（）]*\s*$|\s*评标(办法|方法)前附表\s*$',
                        re.MULTILINE
                    ),
-                    re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
+                    re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
                )
            ]
            output_suffix = "evaluation_method"
@ -364,11 +364,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
                # ),
                (
                    re.compile(
-                        r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]*资格[\u4e00-\u9fff]*\s*$',
+                        r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]*资格[\u4e00-\u9fff、()（）]*\s*$',
                        re.MULTILINE),
                    re.compile(
                        r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:])(?!.*(?:资质|能力|信誉)).*|'
-                        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', re.MULTILINE)
+                        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
                )
            ]
            output_suffix = "qualification"
--- a/flask_app/货物标/基础信息解析main.py
+++ b/flask_app/货物标/基础信息解析main.py
@ -158,7 +158,7 @@ def get_base_info(merged_baseinfo_path,clause_path):
    #         baseinfo_list.append(clean_json_string(response))
    return baseinfo_list

-def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
+def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path,invalid_path):
    baseinfo_list = []
    temp_list = []
    procurement_reqs = {}
@ -169,7 +169,7 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpat
    # 定义一个线程函数来获取采购需求
    def fetch_procurement_reqs_thread():
        nonlocal procurement_reqs
-        procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath)
+        procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path)
    # 创建并启动获取基础信息的线程
    thread1 = threading.Thread(target=get_base_info_thread)
    thread1.start()
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -150,13 +150,17 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter


 def get_patterns_for_procurement():  # Return the (begin_pattern, end_pattern) pair of compiled MULTILINE regexes; presumably used to locate the procurement-requirements chapter — confirm against callers.
+    # begin_pattern = re.compile(
+    #     r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
+    #     re.MULTILINE)
    begin_pattern = re.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*',
-        re.MULTILINE)
-    end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
-    # begin_pattern=re.compile(
-    #     r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(?:服务|项目|商务|技术)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$'
-    # )
+        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # matches "第X章" or "第X部分"
+        r'[\u4e00-\u9fff、()（）]*?'  # lazily matches the allowed characters (CJK, 、, half- and full-width parentheses)
+        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()（）]*?要求[\u4e00-\u9fff、()（）]*?\s*$|'  # one of 服务/项目/商务/技术 followed later by 要求, up to the end of the line
+        r'(?:采购|需求)[\u4e00-\u9fff、()（）]*?)\s*$',  # or 采购/需求 followed by allowed characters up to the end of the line
+        re.MULTILINE
+    )
+    end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)  # any chapter/part heading line; NOTE(review): this numeral class omits 十 while begin_pattern's includes it — confirm headings such as 第十章 should not end the section
    return begin_pattern, end_pattern


@ -167,14 +171,14 @@ def get_patterns_for_evaluation_method():
    # )
    begin_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
-        r'(?:[\u4e00-\u9fff、()（）]*?)'  # 匹配允许的字符（中文、顿号、括号）
-        r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
+        r'(?:[\u4e00-\u9fff、()（）]*?)'  # 匹配允许的字符（中文、顿号、括号）      
+        r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”    注意这里的'.*'是允许这些关键词出现在任意位置，但主体匹配部分仍然受到字符集的限制。
        r'(?=.*(?:办法|方法))'  # 确保包含“办法”或“方法”
        r'[\u4e00-\u9fff、()（）]*\s*$',  # 继续匹配允许的字符直到行尾
        re.MULTILINE
    )
    end_pattern = re.compile(
-        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
+        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
    return begin_pattern, end_pattern

 def get_patterns_for_qualification():
@ -381,7 +385,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
                re.MULTILINE
            )
            new_end_pattern = re.compile(
-                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
+                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
                re.MULTILINE
            )
            print("第三次尝试 tobidders_notice! ")
@ -446,10 +450,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

 def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
    begin_pattern = re.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
+        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
    )
    end_pattern = re.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # 捕获中文部分
+        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',re.MULTILINE  # 捕获中文部分
    )
    exclusion_words = ["合同", "评标", "开标","评审","采购","资格"]  # 在这里添加需要排除的关键词

@ -654,7 +658,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
        # 根据选择设置对应的模式和结束模式
        if selection == 1:
            begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$', re.MULTILINE)
-            end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
+            end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
            local_output_suffix = "notice"
        elif selection == 2:
            begin_pattern = re.compile(
@ -673,7 +677,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
            local_output_suffix = "tobidders_notice"
        elif selection == 5:
            begin_pattern = re.compile(
-                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*')
+                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') #包头中有一章'采购相关说明'
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
            local_output_suffix = "procurement"

@ -799,7 +803,7 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1

 #ztbfile.pdf少资格评审  包头少符合性评审
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -89,9 +89,14 @@ def postprocess(data):
    # 递归处理顶层数据
    return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}

-def get_technical_requirements(file_id):
+def get_technical_requirements(file_id,invalid_path):
+    first_query="该文档中是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'"
+    judge_res=qianwen_long(file_id,first_query)
+    print(judge_res)
+    if '否' in judge_res:
+        file_id=upload_file(invalid_path)
    user_query1 = """
-这是一份货物标中采购要求部分的内容，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
+请你首先定位该采购文件中的采购清单或采购需求部分，请告诉我需要采购的货物，如果有采购清单，请直接根据清单上的货物（或系统）名称给出结果，若没有采购清单，你要从表格中或文中摘取需要采购的系统（或货物），采购需求中可能包含层次关系，例如采购的某大系统中可能包含几种货物，那么你需要用嵌套键值对表示这种关系，且不要遗漏该系统中包含的货物，你的输出请以json格式返回，最外层键名为'采购需求'，嵌套键名为对应的系统名称或货物名称，需与原文保持一致，无需给出采购数量和单位，如有未知内容，在对应键值处填'未知'。以下为示例输出：
 {
    "采购需求": {
        "门禁管理系统": {},
@ -105,9 +110,8 @@ def get_technical_requirements(file_id):
 }
    """
    res = qianwen_long(file_id, user_query1)
-    # print(res)
+    print(res)
    cleaned_res = clean_json_string(res)     #转字典
-    # print(res)
    keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
    if '采购需求' in cleaned_res:
        cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
@ -172,14 +176,17 @@ def test_all_files_in_folder(input_folder, output_folder):
                print(f"处理文件 {file_path} 时出错: {e}")

 if __name__ == "__main__":  # manual run of get_technical_requirements against a local sample
-    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
-    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
-    truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
-    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
-    file_id = upload_file(truncate_file)
-    res=get_technical_requirements(file_id)
+    # # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
+    # # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
+    # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
+    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
+    # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
+    # file_id = upload_file(truncate_file)
+    invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"  # full document; get_technical_requirements uploads it when the first query is answered with '否'
+    file_id="file-fe-FcOjv4FiOGjHRG1pKaFrIBeG"  # hard-coded file id, presumably from an earlier upload_file call — confirm it is still valid before running
+    res=get_technical_requirements(file_id,invalid_path)
    json_string = json.dumps(res, ensure_ascii=False, indent=4)
    print(json_string)
-    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
-    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
-    # test_all_files_in_folder(input_folder, output_folder)
+    # # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
+    # # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
+    # # test_all_files_in_folder(input_folder, output_folder)
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@ -7,7 +7,7 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ


 #获取采购清单
-def fetch_procurement_reqs(procurement_path,procurement_docpath):
+def fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path):
    # 定义默认的 procurement_reqs 字典
    DEFAULT_PROCUREMENT_REQS = {
        "技术要求": "",
@ -25,8 +25,8 @@ def fetch_procurement_reqs(procurement_path,procurement_docpath):
        # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # 提交任务给线程池
-            future_technical = executor.submit(get_technical_requirements, file_id)
-            time.sleep(1)  # 如果需要延迟，可以保留，否则建议移除以提高效率
+            future_technical = executor.submit(get_technical_requirements, file_id,invalid_path)
+            time.sleep(0.5)
            future_business = executor.submit(get_business_requirements, procurement_path, file_id)

            # 获取并行任务的结果
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -90,7 +90,7 @@ def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path,
        merged_baseinfo_path = invalid_path
    if not procurement_docpath:
        procurement_docpath=invalid_docpath
-    basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path)
+    basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path,invalid_path)
    base_info, good_list = post_process_baseinfo(basic_res)
    end_time = time.time()
    logger.info(f"基础信息 done，耗时：{end_time - start_time:.2f} 秒")