zbparse/flask_app/货物标/货物标解析main.py
# Applicable procurement methods: competitive consultation (竞磋), competitive negotiation (竞谈), consultation (磋商), request for quotation (询价), invitation (邀请), single source (单一来源)
import json
import time
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.main.json_utils import transform_json_values
from flask_app.货物标.基础信息解析main import combine_basic_info
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
from flask_app.货物标.货物标截取pdf import truncate_pdf_multiple
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
from flask_app.货物标.无效标和废标和禁止投标整合main import combine_find_invalid
from flask_app.货物标.资格审查main import combine_qualification_review
from flask_app.货物标.评分标准提取main import combine_evaluation_standards
import logging
def get_global_logger(unique_id):
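    """Return the logger bound to unique_id, or the default (root) logger when unique_id is None."""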
    if unique_id is None:
        return logging.getLogger()  # fall back to the default logger
    logger = logging.getLogger(unique_id)
    return logger

logger = None
# Create a global thread pool
executor = ThreadPoolExecutor()

def preprocess_files(output_folder, file_path, file_type, unique_id):
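    """Convert the input file to both pdf and docx, split the pdf into the sections
    needed downstream, convert the Instructions-to-Bidders clauses to json, and
    return a dict of the resulting paths. Returns None for unsupported file types."""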
    logger.info("starting 文件预处理...")
    logger.info("output_folder..." + output_folder)
    # Resolve paths according to the input file type
    if file_type == 1:  # docx
        docx_path = file_path
        pdf_path = docx2pdf(docx_path)  # convert docx to pdf for downstream processing
    elif file_type == 2:  # pdf
        pdf_path = file_path
        docx_path = pdf2docx(pdf_path)  # convert pdf to docx for upload to the knowledge base
    elif file_type == 3:  # doc
        pdf_path = docx2pdf(file_path)
        docx_path = pdf2docx(pdf_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # # Asynchronously upload to the knowledge base
    # future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)

    # Split the pdf into sections.
    # index: 0 -> business/technical/service requirements, 1 -> evaluation method,
    #        2 -> qualification review, 3 -> Instructions-to-Bidders data sheet,
    #        4 -> Instructions-to-Bidders main body, 5 -> tender notice,
    #        6 -> merged basic info (cover + notice + data sheet + main body)
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder)

    # Paths for the individual sections
    invalid_docpath = docx_path  # docx used to extract the invalid-bid section
    procurement_path = truncate_files[0]  # business/technical/service requirements
    evaluation_method_path = truncate_files[1]  # evaluation method
    qualification_path = truncate_files[2]  # qualification review
    tobidders_notice_path = truncate_files[4]  # Instructions to Bidders, main body
    notice_path = truncate_files[5]  # tender notice
    merged_baseinfo_path = truncate_files[6]  # merged cover + tender notice + data sheet + main body
    clause_path = convert_clause_to_json(tobidders_notice_path, output_folder)  # Instructions-to-Bidders clauses: pdf -> json
    logger.info("文件预处理done")

    # Return early instead of waiting for future_knowledge; the dict would carry the Future object
    return {
        'file_path': file_path,
        'output_folder': output_folder,
        'procurement_path': procurement_path,
        'evaluation_method_path': evaluation_method_path,
        'qualification_path': qualification_path,
        'notice_path': notice_path,
        # 'knowledge_future': future_knowledge,  # the Future object
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath,
        'merged_baseinfo_path': merged_baseinfo_path
    }

def fetch_project_basic_info(merged_baseinfo_path, procurement_file_path):  # Instructions-to-Bidders data sheet
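    """Extract the project's basic information from the merged base-info pdf and the procurement-requirements pdf."""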
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(merged_baseinfo_path, procurement_file_path)
    logger.info("基础信息done")
    return basic_res

def fetch_qualification_review(output_folder, qualification_path, notice_path, merged_baseinfo_path):  # qualification review
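    """Extract the qualification-review requirements from the qualification, notice and merged base-info sections."""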
    logger.info("starting资格审查...")
    review_standards_res = combine_qualification_review(output_folder, qualification_path, notice_path, merged_baseinfo_path)
    logger.info("资格审查done")
    return review_standards_res

def fetch_evaluation_standards(evaluation_method_path):  # evaluation criteria
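    """Extract the scoring criteria and split them into technical and commercial scoring."""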
    logger.info("starting 商务评分和技术评分...")
    # Get the dict extracted from the evaluation-method data sheet
    evaluation_standards_res = combine_evaluation_standards(evaluation_method_path)
    # Split into technical scoring and commercial scoring
    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
    commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
    logger.info("商务评分和技术评分done")
    # Return the technical and commercial sections under new keys
    return {
        "technical_standards": technical_standards,
        "commercial_standards": commercial_standards
    }

# TODO: .doc document conversion
def fetch_invalid_requirements(invalid_docpath, output_folder):
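    """Extract invalid-bid, void-bid and bidding-prohibition clauses from the docx."""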
    # Void-bid requirements: 千问 (Qwen)
    logger.info("starting无效标与废标...")
    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder)
    logger.info("无效标与废标done...")
    return find_invalid_res

# Bid document requirements
def fetch_bidding_documents_requirements(clause_path):
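    """Extract the bid-document requirements from the parsed clause json."""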
    logger.info("starting投标文件要求...")
    fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}

# Bid opening, evaluation and award process
def fetch_bid_opening(clause_path):
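    """Extract the bid opening, evaluation and award process from the parsed clause json."""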
    logger.info("starting开评定标流程...")
    fetch_bid_opening_json = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    return {"开评定标流程": fetch_bid_opening_json}

def goods_bid_main(output_folder, file_path, file_type, unique_id):
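    """Parse a goods-procurement tender document and yield each analysis result
    as a JSON string, in the order the concurrent tasks complete."""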
    global logger
    logger = get_global_logger(unique_id)
    # Preprocess the files and collect the resulting paths
    processed_data = preprocess_files(output_folder, file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})  # preprocessing failed: yield an empty JSON object
        return  # without this return, the code below would dereference None
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Start the tasks that do not depend on knowledge_name / index right away
        futures = {
            'evaluation_standards': executor.submit(fetch_evaluation_standards,
                                                    processed_data['evaluation_method_path']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder),
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
                                                              processed_data['clause_path']),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']),
            'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'],
                                         processed_data['procurement_path']),
            'qualification_review': executor.submit(fetch_qualification_review, output_folder,
                                                    processed_data['qualification_path'],
                                                    processed_data['notice_path'],
                                                    processed_data['merged_baseinfo_path']),
        }

        # Yield the results of these independent tasks in completion order
        for future in concurrent.futures.as_completed(futures.values()):
            key = next(k for k, v in futures.items() if v == future)
            try:
                result = future.result()
                # For evaluation_standards, split into technical and commercial scoring
                if key == 'evaluation_standards':
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]
                    # yield the technical and commercial scoring separately
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)},
                                     ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)},
                                     ensure_ascii=False)
                else:
                    # yield the results of the other tasks
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)

        # # Wait for future_knowledge only when knowledge_name and index are needed
        # try:
        #     knowledge_name = "招标解析" + unique_id
        #     index = processed_data['knowledge_future'].result()  # block until the knowledge-base upload finishes
        #
        #     # Submit the tasks that depend on knowledge_name and index
        #     future_dependencies = {
        #         'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'], processed_data['procurement_path']),
        #         'qualification_review': executor.submit(fetch_qualification_review, output_folder,
        #                                                 processed_data['qualification_path'],
        #                                                 processed_data['notice_path'], processed_data['merged_baseinfo_path']),
        #     }
        #     # Yield the results of the dependent tasks in completion order
        #     for future in concurrent.futures.as_completed(future_dependencies.values()):
        #         key = next(k for k, v in future_dependencies.items() if v == future)
        #         try:
        #             result = future.result()
        #             yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
        #         except Exception as exc:
        #             logger.error(f"Error processing {key}: {exc}")
        #             yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
        #
        # except Exception as e:
        #     logger.error(f"Error uploading to knowledge base: {e}")
        #     yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False)

        # Delete the knowledge index
        # deleteKnowledge(index)


# TODO: the invalid-bid section currently strips spaces from its keys; strip spaces from all key names
if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"
    start_time = time.time()
    file_type = 2  # 1: docx, 2: pdf, 3: doc (the test input below is a pdf)
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"
    # goods_bid_main is a generator: iterate it to actually run the parsing and stream the results
    for result in goods_bid_main(output_folder, input_file, file_type, "uuidzyzy11"):
        print(result)
    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed time
    print(f"Function execution took {elapsed_time} seconds.")