# -*- encoding:utf-8 -*-
import json
import logging
import os
import time
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.main.json_utils import clean_json_string
from flask_app.main.判断是否分包等 import merge_json_to_list, read_questions_from_judge
from flask_app.main.基础信息整合 import judge_consortium_bidding
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
from flask_app.main.通义千问long import upload_file
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific
from flask_app.main.post_processing import inner_post_processing

# Prompt file holding the basic-information questions for goods tenders.
# NOTE(review): deployment-specific absolute path; the repo-relative
# equivalent appears to be 'flask_app/static/提示词/基本信息货物标.txt'.
GOODS_BASEINFO_PROMPT_PATH = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'


def get_global_logger(unique_id):
    """Return the logger named ``unique_id``, or the root logger if it is None."""
    if unique_id is None:
        return logging.getLogger()
    return logging.getLogger(unique_id)


# Module-level logger; (re)bound by little_parse_main for each request.
logger = None


def merge(merged):
    """Interpret the model's answer to the "is a bid deposit required?" question.

    Args:
        merged: Parsed JSON answer, expected to be a dict keyed by
            '是否递交投标保证金' or '是否递交磋商保证金'.

    Returns:
        A ``(needs_followup, data)`` tuple:

        * ``(0, merged)`` when the answer is '否'. ``merged`` is mutated in
          place: the yes/no key is removed and placeholder entries for the
          deposit and its refund are added.
        * ``(1, "")`` for every other answer ('是', '未知', a missing key or a
          non-dict payload), so the caller asks the detailed follow-up
          question instead of failing on an unexpected value.
    """
    if not isinstance(merged, dict):
        return 1, ""

    guarantee_key = '是否递交投标保证金' if '是否递交投标保证金' in merged else '是否递交磋商保证金'
    if merged.get(guarantee_key) == '否':
        guarantee_type = '投标' if '投标' in guarantee_key else '磋商'
        merged[f'{guarantee_type}保证金'] = '不提交'
        merged[f'退还{guarantee_type}保证金'] = '/'
        merged.pop(guarantee_key, None)
        return 0, merged

    # '是' keeps its original result; unknown/contradictory answers used to
    # fall through and return None, which broke tuple unpacking in the caller.
    return 1, ""


def get_goods_baseinfo(baseinfo_file_path, prompt_file_path=None):
    """Query the LLM for the basic information of a goods tender.

    Args:
        baseinfo_file_path: Path of the document uploaded to the model.
        prompt_file_path: Optional path of the question file; defaults to
            ``GOODS_BASEINFO_PROMPT_PATH``.

    Returns:
        A list of parsed JSON answers. The deposit yes/no answer is replaced
        either by placeholder entries (no deposit) or by the answer to a
        detailed follow-up question. Empty when the model returned nothing.
    """
    file_id = upload_file(baseinfo_file_path)
    if prompt_file_path is None:
        prompt_file_path = GOODS_BASEINFO_PROMPT_PATH
    questions = read_questions_from_file(prompt_file_path)
    more_query = "请你根据招标文件信息,回答以下问题:是否需要递交投标保证金(或磋商保证金)?请按json格式给我提供信息,键名分为'是否递交投标保证金'(或'是否递交磋商保证金'),键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'。"
    questions.append(more_query)

    # llm_type: 1 = Bailian RAG, 2 = qianwen-long.
    baseinfo_results = multi_threading(questions, "", file_id, 2)
    if not baseinfo_results:
        print("基础信息整合: multi_threading error!")
        return []
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results]

    # The deposit question was appended last.
    # NOTE(review): assumes multi_threading preserves question order — confirm.
    needs_followup, merged = merge(baseinfo_list.pop())
    if not needs_followup:
        baseinfo_list.append(merged)
        return baseinfo_list

    judge_question = "根据招标文件第二章投标人须知,该项目投标保证金(或磋商保证金)的内容或要求是什么?请按json格式给我提供信息,外层键名为'投标保证金'(或'磋商保证金'),若需要以嵌套键值对返回结果,那么嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致。"
    # Pass a list, as in the call above; a bare string would be iterated
    # character by character if multi_threading loops over its questions.
    res2 = multi_threading([judge_question], "", file_id, 2)
    if not res2:
        print("基础信息整合: multi_threading error!")
    else:
        for _, response in res2:
            baseinfo_list.append(clean_json_string(response))
    return baseinfo_list


# Goods tender
def little_parse_goods(output_folder, file_path):
    """Truncate the tender PDF and extract its aggregated basic information.

    Returns:
        ``{"基础信息": aggregated_baseinfo}``.
    """
    files = truncate_pdf_specific(file_path, output_folder)
    # NOTE(review): presumably the last truncated file holds the basic
    # information section — confirm against truncate_pdf_specific.
    baseinfo_list = get_goods_baseinfo(files[-1])
    aggregated_baseinfo = aggregate_basic_info_goods(baseinfo_list)
    return {"基础信息": aggregated_baseinfo}


def little_parse_engineering(output_folder, downloaded_filepath):
    """Placeholder for engineering-tender parsing; currently always returns True."""
    return True


def little_parse_main(output_folder, file_path, file_type, zb_type, unique_id):
    """Parse a tender file and save the post-processed result as JSON.

    Args:
        output_folder (str): Output folder path.
        file_path (str): Path of the file to process.
        file_type (int): File type (1: docx, 2: pdf, 3: doc).
        zb_type (int): Tender type (2: goods tender, otherwise engineering).
        unique_id: Name of the logger to use; None selects the root logger.

    Returns:
        str | None: Path of the saved JSON file, or None when the file type
        is unsupported or the parser produced no '基础信息' section.
    """
    global logger
    logger = get_global_logger(unique_id)

    # Normalise the input to a PDF for the downstream parsers.
    if file_type in (1, 3):  # docx / doc
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Dispatch on the tender type.
    if zb_type == 2:  # goods tender
        combined_data = little_parse_goods(output_folder, pdf_path)
        logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
    else:
        combined_data = little_parse_engineering(output_folder, pdf_path)
        logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))

    # The engineering parser is still a stub that returns True; subscripting
    # that would raise TypeError, so fail the same way as unsupported input.
    if not isinstance(combined_data, dict) or "基础信息" not in combined_data:
        logger.error("Parser returned no '基础信息' section. Post-processing halted.")
        return None
    res = inner_post_processing(combined_data["基础信息"])

    # Persist the result next to the other outputs.
    os.makedirs(output_folder, exist_ok=True)
    json_path = os.path.join(output_folder, "final_result.json")
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False, indent=4)
    logger.info("Extraction results saved to %s", json_path)

    return json_path


if __name__ == "__main__":
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\little_parse_output"
    start_time = time.time()
    file_type = 2  # 1: docx, 2: pdf, 3: other
    zb_type = 2  # 1: engineering tender, 2: goods tender
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
    res = little_parse_main(output_folder, input_file, file_type, zb_type, "122334")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed wall-clock time
    print(f"Function execution took {elapsed_time} seconds.")