import concurrent.futures
import json
import time

from flask_app.general.doubao import pdf2txt
from flask_app.general.file2markdown import convert_pdf_to_markdown
from flask_app.general.format_change import pdf2docx
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements


# Fetch the procurement list / requirements.
def fetch_procurement_reqs(procurement_path: str, invalid_path: str) -> dict:
    """Extract procurement requirements from a procurement-section PDF.

    The technical extraction (``get_technical_requirements``) and the
    business/service/other extraction (``get_business_requirements``) are
    submitted to a thread pool so they run concurrently, and their results
    are merged into one dictionary.

    Args:
        procurement_path: Path of the procurement-requirements PDF. A falsy
            value (e.g. an empty string) short-circuits to the default result
            without doing any conversion work.
        invalid_path: Passed through unchanged to
            ``get_technical_requirements``.

    Returns:
        On success, a dict whose ``"采购需求"`` entry comes from the technical
        result (``{}`` when that key is absent), updated with every key of
        the business result. Whatever keys the business extraction produces
        are kept as-is, and a ``"采购需求"`` key there would overwrite the
        technical one.
        On an empty path, or when anything inside the extraction step raises,
        a fresh copy of the defaults: the keys ``"采购需求"``, ``"技术要求"``,
        ``"商务要求"``, ``"服务要求"`` and ``"其他要求"``, each mapped to ``""``.

    Raises:
        Any exception raised by ``pdf2docx`` propagates to the caller, because
        that call sits outside the ``try`` block.
    """
    # Fallback result handed out on empty input or on failure.
    default_procurement_reqs = {
        "采购需求": "",
        "技术要求": "",
        "商务要求": "",
        "服务要求": "",
        "其他要求": "",
    }

    # Guard BEFORE converting anything: previously pdf2docx was invoked even
    # for an empty path, doing pointless work (and possibly raising) before
    # the empty-input check was reached.
    if not procurement_path:
        return default_procurement_reqs.copy()

    # DOCX version of the procurement-requirements PDF.
    procurement_docpath = pdf2docx(procurement_path)

    try:
        # Plain-text extraction. A Markdown conversion via
        # convert_pdf_to_markdown(procurement_path) is the alternative that
        # was tried here previously.
        processed_filepath = pdf2txt(procurement_path)

        # Run both extractions in parallel on a thread pool.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_technical = executor.submit(
                get_technical_requirements,
                procurement_docpath,
                invalid_path,
                processed_filepath,
            )
            # Pre-existing delay between the two submissions, kept on purpose.
            # NOTE(review): the reason is not visible here (possibly rate
            # limiting of a downstream service) - confirm before removing.
            time.sleep(0.5)
            future_business = executor.submit(
                get_business_requirements,
                procurement_path,
                procurement_docpath,
            )

            # Block until both tasks have finished.
            technical_requirements = future_technical.result()
            business_requirements = future_business.result()

        # Assemble the final dictionary, starting from the technical part.
        procurement_reqs = {
            "采购需求": technical_requirements.get("采购需求", {}),
        }

        # Merge the business result wholesale so that whatever keys it
        # contains (e.g. "技术要求", "服务要求" or a combined
        # "技术、服务要求") are preserved.
        procurement_reqs.update(business_requirements)

        # To guarantee that every default key is present, one could add:
        #     for key, default_value in default_procurement_reqs.items():
        #         procurement_reqs.setdefault(key, default_value)

        return procurement_reqs

    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the
        # defaults instead of failing the caller.
        print(f"Error in fetch_procurement_reqs: {e}")
        return default_procurement_reqs.copy()
# TODO: The technical-requirements extraction could run after the
#       technical-parameters extraction: feed the complete technical
#       parameters to the LLM and ask what else there is besides them, so that
#       technical-bid content is separated from everything else.
# TODO: 094 has a problem (presumably a sample-document id - confirm).
if __name__ == "__main__":
    # Manual smoke run against local sample files; prints the extracted
    # requirements as JSON followed by the elapsed wall-clock time.
    started_at = time.time()

    # Not used by this run; kept as in the original script.
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"

    # Alternative sample input:
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf"
    procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf"
    invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"

    res = fetch_procurement_reqs(procurement_path, invalid_path)
    # ensure_ascii=False keeps the Chinese keys/values readable in the output.
    print(json.dumps(res, ensure_ascii=False, indent=4))

    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")