diff --git a/flask_app/general/纯技术参数要求提取.py b/flask_app/general/纯技术参数要求提取.py index 1856a09..03babd8 100644 --- a/flask_app/general/纯技术参数要求提取.py +++ b/flask_app/general/纯技术参数要求提取.py @@ -1,8 +1,19 @@ -from flask_app.general.format_change import pdf2docx +import json +import logging + +from flask_app.general.format_change import pdf2docx, docx2pdf from flask_app.general.通义千问long import upload_file from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main from flask_app.货物标.技术参数要求提取 import get_technical_requirements +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + + +logger = None def extract_matching_keys(data_dict, good_list): """ @@ -31,23 +42,38 @@ def extract_matching_keys(data_dict, good_list): recurse(data_dict) return result -def get_technical_requirements_main(file_path,output_folder): - truncate_file=truncate_pdf_main(file_path,output_folder,5)[0] +def get_technical_requirements_main(file_path,file_type,unique_id,output_folder): + global logger + logger = get_global_logger(unique_id) + if file_type == 1: # docx + docx_path = file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = file_path + elif file_type == 3: # doc + pdf_path = docx2pdf(file_path) + else: + logger.error("Unsupported file type provided. Preprocessing halted.") + return None + truncate_file=truncate_pdf_main(pdf_path,output_folder,5)[0] if not truncate_file: - truncate_file=file_path #直接传整份文件 + truncate_file=pdf_path #直接传整份文件 truncate_file_docx=pdf2docx(truncate_file) file_id=upload_file(truncate_file_docx) # file_id=upload_file(truncate_file) - final_res=get_technical_requirements(file_id) + final_res=get_technical_requirements(file_id,pdf_path) # 安全地提取 "技术要求" 内部的字典内容 if isinstance(final_res, dict) and '技术要求' in final_res and isinstance(final_res['技术要求'], dict): technical_requirements = final_res['技术要求'] good_list = technical_requirements.pop('货物列表', []) # 如果 '货物列表' 不存在,返回 [] + print(good_list) + logger.info("Collected good_list from the processing function: %s", good_list) return extract_matching_keys(technical_requirements,good_list) else: return final_res if __name__ == "__main__": - file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile.pdf" + file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf" + file_type=2 output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp" - res=get_technical_requirements_main(file_path,output_folder) - print(res) + res=get_technical_requirements_main(file_path,file_type,"123",output_folder) + print(json.dumps(res,ensure_ascii=False,indent=4)) diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py index e5f24c7..61a0887 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -12,8 +12,10 @@ from flask_app.general.post_processing import outer_post_processing from flask_app.main.工程标解析main import engineering_bid_main from flask_app.货物标.货物标解析main import goods_bid_main from flask_app.general.纯技术参数要求提取 import get_technical_requirements_main + app = Flask(__name__) + class CSTFormatter(logging.Formatter): """自定义的 Formatter,将日志的时间戳调整为中国标准时间(UTC+8)""" @@ -156,11 +158,13 @@ def validate_request(default_zb_type=1): return jsonify({'error': 'Invalid zb_type provided'}), 400 return file_url, zb_type -#提取采购需求 + +# 提取采购需求 @app.route('/procurement_reqs', methods=['POST']) def get_procurement_reqs(): logger = g.logger - output_folder=g.output_folder + output_folder = g.output_folder + unique_id=g.unique_id file_url, zb_type = validate_request() if isinstance(file_url, tuple): # Check if the returned value is an error response return file_url @@ -173,8 +177,8 @@ def get_procurement_reqs(): 'message': 'This endpoint only supports zb_type 2 (procurement requirements)' }), 400 else: - final_res_path=os.path.join(output_folder,'final_result.json') - response = download_and_process_file_for_procurement(file_url) + final_res_path = os.path.join(output_folder, 'final_result.json') + response = download_and_process_file_for_procurement(file_url,unique_id) try: with open(final_res_path, 'w', encoding='utf-8') as json_file: json.dump(response, json_file, ensure_ascii=False, indent=4) @@ -190,8 +194,9 @@ def get_procurement_reqs(): logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 return jsonify({'error': str(e)}), 500 -#提取采购需求 -def download_and_process_file_for_procurement(file_url): + +# 提取采购需求 +def download_and_process_file_for_procurement(file_url,unique_id): """ 下载并处理采购需求文件。 @@ -211,18 +216,19 @@ def download_and_process_file_for_procurement(file_url): logger.error("Unsupported file type or failed to download file") return None logger.info("Local file path: " + downloaded_filepath) - res = get_technical_requirements_main(downloaded_filepath, output_folder) + res = get_technical_requirements_main(downloaded_filepath, file_type, unique_id,output_folder) return res -@app.route('/little_zbparse',methods=['POST']) + +@app.route('/little_zbparse', methods=['POST']) def little_zbparse(): - logger=g.logger - file_url,zb_type = validate_request() + logger = g.logger + file_url, zb_type = validate_request() if isinstance(file_url, tuple): # Check if the returned value is an error response return file_url try: logger.info("starting parsing url:" + file_url) - final_json_path= download_and_process_file(file_url,zb_type) + final_json_path = download_and_process_file(file_url, zb_type) if not final_json_path: return jsonify({'error': 'File processing failed'}), 500 response = generate_response(final_json_path) # 先获取响应内容 @@ -232,6 +238,7 @@ def little_zbparse(): logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 return jsonify({'error': str(e)}), 500 + def download_and_process_file(file_url, zb_type): """ 下载并处理文件,根据zb_type选择处理函数。 @@ -259,6 +266,7 @@ def download_and_process_file(file_url, zb_type): processed_file_path = little_parse_main(output_folder, downloaded_filepath, file_type, zb_type, g.unique_id) return processed_file_path + def generate_response(final_json_path): logger = g.logger # 检查final_json_path是否为空或None @@ -287,16 +295,17 @@ def zbparse(): # 获取并显示接收到的 JSON 数据 received_data = request.get_json() logger.info("Received JSON data: " + str(received_data)) - file_url,zb_type = validate_request() + file_url, zb_type = validate_request() if isinstance(file_url, tuple): # Check if the returned value is an error response return file_url try: logger.info("starting parsing url:" + file_url) - return Response(stream_with_context(process_and_stream(file_url,zb_type)), content_type='text/event-stream') + return Response(stream_with_context(process_and_stream(file_url, zb_type)), content_type='text/event-stream') except Exception as e: logger.error('Exception occurred: ' + str(e)) return jsonify({'error': str(e)}), 500 + # 分段返回 def process_and_stream(file_url, zb_type): """ @@ -352,7 +361,7 @@ def process_and_stream(file_url, zb_type): 1: engineering_bid_main, 2: goods_bid_main } - processing_func = processing_functions.get(zb_type, engineering_bid_main) #默认按工程标解析 + processing_func = processing_functions.get(zb_type, engineering_bid_main) # 默认按工程标解析 # 从 processing_func 获取数据 for data in processing_func(output_folder, downloaded_filepath, file_type, unique_id): @@ -386,17 +395,18 @@ def process_and_stream(file_url, zb_type): } yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n" - base_end_time=time.time() + base_end_time = time.time() logger.info(f"分段解析完成,耗时:{base_end_time - start_time:.2f} 秒") # **保存 combined_data 到 output_folder 下的 'final_result.json'** output_json_path = os.path.join(output_folder, 'final_result.json') - extracted_info_path=os.path.join(output_folder, 'extracted_result.json') + extracted_info_path = os.path.join(output_folder, 'extracted_result.json') includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"] - final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes,good_list) + final_result, extracted_info, procurement_reqs = outer_post_processing(combined_data, includes, good_list) - logger.info(f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}") # 添加日志记录 - #采购需求 - procurement_reqs_response={ + logger.info( + f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}") # 添加日志记录 + # 采购需求 + procurement_reqs_response = { 'message': 'procurement_reqs', 'filename': os.path.basename(downloaded_filepath), 'data': json.dumps(procurement_reqs, ensure_ascii=False) @@ -417,7 +427,7 @@ def process_and_stream(file_url, zb_type): except IOError as e: logger.error(f"保存JSON文件时出错: {e}") - #提取的数据 + # 提取的数据 extracted_info_response = { 'message': 'extracted_info', 'filename': os.path.basename(downloaded_filepath), diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index 8466848..65133c8 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -347,7 +347,7 @@ def truncate_pdf_main(input_path, output_folder, selection): r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$', re.MULTILINE ), - re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) + re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) ) ] output_suffix = "evaluation_method" @@ -364,11 +364,11 @@ def truncate_pdf_main(input_path, output_folder, selection): # ), ( re.compile( - r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]*资格[\u4e00-\u9fff]*\s*$', + r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$', re.MULTILINE), re.compile( r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|' - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', re.MULTILINE) + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) ) ] output_suffix = "qualification" diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py index e4c5e7b..6049ed9 100644 --- a/flask_app/货物标/基础信息解析main.py +++ b/flask_app/货物标/基础信息解析main.py @@ -158,7 +158,7 @@ def get_base_info(merged_baseinfo_path,clause_path): # baseinfo_list.append(clean_json_string(response)) return baseinfo_list -def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path): +def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path,invalid_path): baseinfo_list = [] temp_list = [] procurement_reqs = {} @@ -169,7 +169,7 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpat # 定义一个线程函数来获取采购需求 def fetch_procurement_reqs_thread(): nonlocal procurement_reqs - procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath) + procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path) # 创建并启动获取基础信息的线程 thread1 = threading.Thread(target=get_base_info_thread) thread1.start() diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 8a1acf8..e047fb6 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -150,13 +150,17 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter def get_patterns_for_procurement(): + # begin_pattern = re.compile( + # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*', + # re.MULTILINE) begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*', - re.MULTILINE) - end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) - # begin_pattern=re.compile( - # r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(?:服务|项目|商务|技术)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$' - # ) + r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” + r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符 + r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()()]*?要求[\u4e00-\u9fff、()()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求” + r'(?:采购|需求)[\u4e00-\u9fff、()()]*?)\s*$', # 或者匹配“采购”或“需求” + re.MULTILINE + ) + end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) return begin_pattern, end_pattern @@ -167,14 +171,14 @@ def get_patterns_for_evaluation_method(): # ) begin_pattern = re.compile( r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” - r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号) - r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” + r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号) + r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。 r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法” r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾 re.MULTILINE ) end_pattern = re.compile( - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) return begin_pattern, end_pattern def get_patterns_for_qualification(): @@ -381,7 +385,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h re.MULTILINE ) new_end_pattern = re.compile( - r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', + r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE ) print("第三次尝试 tobidders_notice! ") @@ -446,10 +450,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page): begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+' + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE ) end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分 + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',re.MULTILINE # 捕获中文部分 ) exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词 @@ -654,7 +658,7 @@ def process_input(input_path, output_folder, selection, output_suffix): # 根据选择设置对应的模式和结束模式 if selection == 1: begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE) - end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) + end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) local_output_suffix = "notice" elif selection == 2: begin_pattern = re.compile( @@ -673,7 +677,7 @@ def process_input(input_path, output_folder, selection, output_suffix): local_output_suffix = "tobidders_notice" elif selection == 5: begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*') + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') #包头中有一章'采购相关说明' end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') local_output_suffix = "procurement" @@ -799,7 +803,7 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 #ztbfile.pdf少资格评审 包头少符合性评审 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index ffee0da..1c2cb06 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -89,9 +89,14 @@ def postprocess(data): # 递归处理顶层数据 return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()} -def get_technical_requirements(file_id): +def get_technical_requirements(file_id,invalid_path): + first_query="该文档中是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'" + judge_res=qianwen_long(file_id,first_query) + print(judge_res) + if '否' in judge_res: + file_id=upload_file(invalid_path) user_query1 = """ -这是一份货物标中采购要求部分的内容,请告诉我需要采购的货物,如果有采购清单,请直接根据清单上的货物(或系统)名称给出结果,若没有采购清单,你要从表格中或文中摘取需要采购的系统(或货物),采购需求中可能包含层次关系,例如采购的某大系统中可能包含几种货物,那么你需要用嵌套键值对表示这种关系,且不要遗漏该系统中包含的货物,你的输出请以json格式返回,最外层键名为'采购需求',嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填'未知'。以下为示例输出: +请你首先定位该采购文件中的采购清单或采购需求部分,请告诉我需要采购的货物,如果有采购清单,请直接根据清单上的货物(或系统)名称给出结果,若没有采购清单,你要从表格中或文中摘取需要采购的系统(或货物),采购需求中可能包含层次关系,例如采购的某大系统中可能包含几种货物,那么你需要用嵌套键值对表示这种关系,且不要遗漏该系统中包含的货物,你的输出请以json格式返回,最外层键名为'采购需求',嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填'未知'。以下为示例输出: { "采购需求": { "门禁管理系统": {}, @@ -105,9 +110,8 @@ def get_technical_requirements(file_id): } """ res = qianwen_long(file_id, user_query1) - # print(res) + print(res) cleaned_res = clean_json_string(res) #转字典 - # print(res) keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单 if '采购需求' in cleaned_res: cleaned_res['技术要求'] = cleaned_res.pop('采购需求') @@ -172,14 +176,17 @@ def test_all_files_in_folder(input_folder, output_folder): print(f"处理文件 {file_path} 时出错: {e}") if __name__ == "__main__": - # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" - # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx" - truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx" - output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp" - file_id = upload_file(truncate_file) - res=get_technical_requirements(file_id) + # # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" + # # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx" + # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf" + # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx" + # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp" + # file_id = upload_file(truncate_file) + invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf" + file_id="file-fe-FcOjv4FiOGjHRG1pKaFrIBeG" + res=get_technical_requirements(file_id,invalid_path) json_string = json.dumps(res, ensure_ascii=False, indent=4) print(json_string) - # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" - # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3" - # test_all_files_in_folder(input_folder, output_folder) + # # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" + # # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3" + # # test_all_files_in_folder(input_folder, output_folder) \ No newline at end of file diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 29215cd..9fd0bca 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -7,7 +7,7 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ #获取采购清单 -def fetch_procurement_reqs(procurement_path,procurement_docpath): +def fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path): # 定义默认的 procurement_reqs 字典 DEFAULT_PROCUREMENT_REQS = { "技术要求": "", @@ -25,8 +25,8 @@ def fetch_procurement_reqs(procurement_path,procurement_docpath): # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: # 提交任务给线程池 - future_technical = executor.submit(get_technical_requirements, file_id) - time.sleep(1) # 如果需要延迟,可以保留,否则建议移除以提高效率 + future_technical = executor.submit(get_technical_requirements, file_id,invalid_path) + time.sleep(0.5) future_business = executor.submit(get_business_requirements, procurement_path, file_id) # 获取并行任务的结果 diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index 5f31508..1e2206b 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -90,7 +90,7 @@ def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, merged_baseinfo_path = invalid_path if not procurement_docpath: procurement_docpath=invalid_docpath - basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path) + basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path,invalid_path) base_info, good_list = post_process_baseinfo(basic_res) end_time = time.time() logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒")