11.6修复bug
This commit is contained in:
parent
d4d1a14c06
commit
3bd548ea81
@ -1,8 +1,19 @@
|
|||||||
from flask_app.general.format_change import pdf2docx
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||||
from flask_app.general.通义千问long import upload_file
|
from flask_app.general.通义千问long import upload_file
|
||||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
|
||||||
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
|
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
|
||||||
|
|
||||||
|
def get_global_logger(unique_id):
    """Return the logger associated with a request identifier.

    Args:
        unique_id: Identifier of the current request. ``None`` selects the
            root logger. Any other value is converted to ``str`` before the
            lookup, because ``logging.getLogger`` only accepts string names
            (a ``uuid.UUID`` or ``int`` id would otherwise raise
            ``TypeError``).

    Returns:
        logging.Logger: The root logger when ``unique_id`` is ``None``,
        otherwise the logger registered under ``str(unique_id)``.
    """
    if unique_id is None:
        return logging.getLogger()  # default (root) logger
    return logging.getLogger(str(unique_id))
|
||||||
|
|
||||||
|
|
||||||
|
logger = None
|
||||||
|
|
||||||
def extract_matching_keys(data_dict, good_list):
|
def extract_matching_keys(data_dict, good_list):
|
||||||
"""
|
"""
|
||||||
@ -31,23 +42,38 @@ def extract_matching_keys(data_dict, good_list):
|
|||||||
recurse(data_dict)
|
recurse(data_dict)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_technical_requirements_main(file_path, file_type, unique_id, output_folder):
    """Extract the technical requirements of a goods tender document.

    The input is normalised to PDF, the procurement section is cut out of it
    (selection ``5`` of ``truncate_pdf_main``), the cut is converted to docx
    and uploaded, and the uploaded file is queried through
    ``get_technical_requirements``.

    Args:
        file_path: Path of the downloaded tender file.
        file_type: ``1`` for docx, ``2`` for pdf, ``3`` for doc.
        unique_id: Request identifier used to select the logger; the
            module-level ``logger`` is rebound to it.
        output_folder: Folder passed to ``truncate_pdf_main`` for its output.

    Returns:
        ``None`` when the file type is unsupported or no PDF path could be
        obtained. Otherwise the result of ``extract_matching_keys`` when the
        model response contains a dict under '技术要求', or the raw model
        response when it does not.
    """
    global logger
    logger = get_global_logger(unique_id)

    if file_type in (1, 3):  # docx / doc: convert to pdf for the later steps
        # NOTE(review): doc files go through docx2pdf as well, exactly as in
        # the previous implementation - confirm the helper supports .doc.
        pdf_path = docx2pdf(file_path)
    elif file_type == 2:  # already a pdf
        pdf_path = file_path
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # docx2pdf's failure behaviour is not visible here; an empty result is
    # treated as a failed conversion instead of being passed further down.
    if not pdf_path:
        logger.error("Failed to obtain a PDF path for %s. Preprocessing halted.", file_path)
        return None

    # Guard the [0] access: an empty or None result must reach the
    # "send the whole file" fallback below rather than raise.
    truncated = truncate_pdf_main(pdf_path, output_folder, 5)
    truncate_file = truncated[0] if truncated else None
    if not truncate_file:
        truncate_file = pdf_path  # fall back to the whole document

    truncate_file_docx = pdf2docx(truncate_file)
    file_id = upload_file(truncate_file_docx)
    final_res = get_technical_requirements(file_id, pdf_path)

    # Only unwrap '技术要求' when the response really has the expected shape.
    if not isinstance(final_res, dict):
        return final_res
    technical_requirements = final_res.get('技术要求')
    if not isinstance(technical_requirements, dict):
        return final_res

    good_list = technical_requirements.pop('货物列表', [])  # [] when the key is absent
    logger.info("Collected good_list from the processing function: %s", good_list)
    return extract_matching_keys(technical_requirements, good_list)
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile.pdf"
|
file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
|
||||||
|
file_type=2
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\tmp"
|
||||||
res=get_technical_requirements_main(file_path,output_folder)
|
res=get_technical_requirements_main(file_path,file_type,"123",output_folder)
|
||||||
print(res)
|
print(json.dumps(res,ensure_ascii=False,indent=4))
|
||||||
|
@ -12,8 +12,10 @@ from flask_app.general.post_processing import outer_post_processing
|
|||||||
from flask_app.main.工程标解析main import engineering_bid_main
|
from flask_app.main.工程标解析main import engineering_bid_main
|
||||||
from flask_app.货物标.货物标解析main import goods_bid_main
|
from flask_app.货物标.货物标解析main import goods_bid_main
|
||||||
from flask_app.general.纯技术参数要求提取 import get_technical_requirements_main
|
from flask_app.general.纯技术参数要求提取 import get_technical_requirements_main
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
class CSTFormatter(logging.Formatter):
|
class CSTFormatter(logging.Formatter):
|
||||||
"""自定义的 Formatter,将日志的时间戳调整为中国标准时间(UTC+8)"""
|
"""自定义的 Formatter,将日志的时间戳调整为中国标准时间(UTC+8)"""
|
||||||
|
|
||||||
@ -156,11 +158,13 @@ def validate_request(default_zb_type=1):
|
|||||||
return jsonify({'error': 'Invalid zb_type provided'}), 400
|
return jsonify({'error': 'Invalid zb_type provided'}), 400
|
||||||
return file_url, zb_type
|
return file_url, zb_type
|
||||||
|
|
||||||
|
|
||||||
# 提取采购需求
|
# 提取采购需求
|
||||||
@app.route('/procurement_reqs', methods=['POST'])
|
@app.route('/procurement_reqs', methods=['POST'])
|
||||||
def get_procurement_reqs():
|
def get_procurement_reqs():
|
||||||
logger = g.logger
|
logger = g.logger
|
||||||
output_folder = g.output_folder
|
output_folder = g.output_folder
|
||||||
|
unique_id=g.unique_id
|
||||||
file_url, zb_type = validate_request()
|
file_url, zb_type = validate_request()
|
||||||
if isinstance(file_url, tuple): # Check if the returned value is an error response
|
if isinstance(file_url, tuple): # Check if the returned value is an error response
|
||||||
return file_url
|
return file_url
|
||||||
@ -174,7 +178,7 @@ def get_procurement_reqs():
|
|||||||
}), 400
|
}), 400
|
||||||
else:
|
else:
|
||||||
final_res_path = os.path.join(output_folder, 'final_result.json')
|
final_res_path = os.path.join(output_folder, 'final_result.json')
|
||||||
response = download_and_process_file_for_procurement(file_url)
|
response = download_and_process_file_for_procurement(file_url,unique_id)
|
||||||
try:
|
try:
|
||||||
with open(final_res_path, 'w', encoding='utf-8') as json_file:
|
with open(final_res_path, 'w', encoding='utf-8') as json_file:
|
||||||
json.dump(response, json_file, ensure_ascii=False, indent=4)
|
json.dump(response, json_file, ensure_ascii=False, indent=4)
|
||||||
@ -190,8 +194,9 @@ def get_procurement_reqs():
|
|||||||
logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录
|
logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录
|
||||||
return jsonify({'error': str(e)}), 500
|
return jsonify({'error': str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
# 提取采购需求
|
# 提取采购需求
|
||||||
def download_and_process_file_for_procurement(file_url):
|
def download_and_process_file_for_procurement(file_url,unique_id):
|
||||||
"""
|
"""
|
||||||
下载并处理采购需求文件。
|
下载并处理采购需求文件。
|
||||||
|
|
||||||
@ -211,9 +216,10 @@ def download_and_process_file_for_procurement(file_url):
|
|||||||
logger.error("Unsupported file type or failed to download file")
|
logger.error("Unsupported file type or failed to download file")
|
||||||
return None
|
return None
|
||||||
logger.info("Local file path: " + downloaded_filepath)
|
logger.info("Local file path: " + downloaded_filepath)
|
||||||
res = get_technical_requirements_main(downloaded_filepath, output_folder)
|
res = get_technical_requirements_main(downloaded_filepath, file_type, unique_id,output_folder)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
@app.route('/little_zbparse', methods=['POST'])
|
@app.route('/little_zbparse', methods=['POST'])
|
||||||
def little_zbparse():
|
def little_zbparse():
|
||||||
logger = g.logger
|
logger = g.logger
|
||||||
@ -232,6 +238,7 @@ def little_zbparse():
|
|||||||
logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录
|
logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录
|
||||||
return jsonify({'error': str(e)}), 500
|
return jsonify({'error': str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
def download_and_process_file(file_url, zb_type):
|
def download_and_process_file(file_url, zb_type):
|
||||||
"""
|
"""
|
||||||
下载并处理文件,根据zb_type选择处理函数。
|
下载并处理文件,根据zb_type选择处理函数。
|
||||||
@ -259,6 +266,7 @@ def download_and_process_file(file_url, zb_type):
|
|||||||
processed_file_path = little_parse_main(output_folder, downloaded_filepath, file_type, zb_type, g.unique_id)
|
processed_file_path = little_parse_main(output_folder, downloaded_filepath, file_type, zb_type, g.unique_id)
|
||||||
return processed_file_path
|
return processed_file_path
|
||||||
|
|
||||||
|
|
||||||
def generate_response(final_json_path):
|
def generate_response(final_json_path):
|
||||||
logger = g.logger
|
logger = g.logger
|
||||||
# 检查final_json_path是否为空或None
|
# 检查final_json_path是否为空或None
|
||||||
@ -297,6 +305,7 @@ def zbparse():
|
|||||||
logger.error('Exception occurred: ' + str(e))
|
logger.error('Exception occurred: ' + str(e))
|
||||||
return jsonify({'error': str(e)}), 500
|
return jsonify({'error': str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
# 分段返回
|
# 分段返回
|
||||||
def process_and_stream(file_url, zb_type):
|
def process_and_stream(file_url, zb_type):
|
||||||
"""
|
"""
|
||||||
@ -394,7 +403,8 @@ def process_and_stream(file_url, zb_type):
|
|||||||
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
|
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
|
||||||
final_result, extracted_info, procurement_reqs = outer_post_processing(combined_data, includes, good_list)
|
final_result, extracted_info, procurement_reqs = outer_post_processing(combined_data, includes, good_list)
|
||||||
|
|
||||||
logger.info(f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}") # 添加日志记录
|
logger.info(
|
||||||
|
f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}") # 添加日志记录
|
||||||
# 采购需求
|
# 采购需求
|
||||||
procurement_reqs_response = {
|
procurement_reqs_response = {
|
||||||
'message': 'procurement_reqs',
|
'message': 'procurement_reqs',
|
||||||
|
@ -347,7 +347,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
),
|
),
|
||||||
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
output_suffix = "evaluation_method"
|
output_suffix = "evaluation_method"
|
||||||
@ -364,11 +364,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
# ),
|
# ),
|
||||||
(
|
(
|
||||||
re.compile(
|
re.compile(
|
||||||
r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]*资格[\u4e00-\u9fff]*\s*$',
|
r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$',
|
||||||
re.MULTILINE),
|
re.MULTILINE),
|
||||||
re.compile(
|
re.compile(
|
||||||
r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|'
|
r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|'
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
output_suffix = "qualification"
|
output_suffix = "qualification"
|
||||||
|
@ -158,7 +158,7 @@ def get_base_info(merged_baseinfo_path,clause_path):
|
|||||||
# baseinfo_list.append(clean_json_string(response))
|
# baseinfo_list.append(clean_json_string(response))
|
||||||
return baseinfo_list
|
return baseinfo_list
|
||||||
|
|
||||||
def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
|
def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path,invalid_path):
|
||||||
baseinfo_list = []
|
baseinfo_list = []
|
||||||
temp_list = []
|
temp_list = []
|
||||||
procurement_reqs = {}
|
procurement_reqs = {}
|
||||||
@ -169,7 +169,7 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpat
|
|||||||
# 定义一个线程函数来获取采购需求
|
# 定义一个线程函数来获取采购需求
|
||||||
def fetch_procurement_reqs_thread():
|
def fetch_procurement_reqs_thread():
|
||||||
nonlocal procurement_reqs
|
nonlocal procurement_reqs
|
||||||
procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath)
|
procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path)
|
||||||
# 创建并启动获取基础信息的线程
|
# 创建并启动获取基础信息的线程
|
||||||
thread1 = threading.Thread(target=get_base_info_thread)
|
thread1 = threading.Thread(target=get_base_info_thread)
|
||||||
thread1.start()
|
thread1.start()
|
||||||
|
@ -150,13 +150,17 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
|
|
||||||
|
|
||||||
def get_patterns_for_procurement():
    """Build the regexes that delimit the procurement-requirements chapter.

    Both patterns are compiled with ``re.MULTILINE`` and must match up to the
    end of a line. They share one chapter-numeral class, so a heading such as
    "第十章 ..." or "第10章 ..." is recognised as the end of the section;
    the previous end pattern only accepted 一-九 and 1-9 and therefore
    missed chapter ten and later.

    Returns:
        tuple[re.Pattern, re.Pattern]: ``(begin_pattern, end_pattern)``.
        ``begin_pattern`` matches a "第X章"/"第X部分" heading whose title
        contains 服务/项目/商务/技术 followed later by 要求, or contains
        采购 or 需求. ``end_pattern`` matches any "第X章"/"第X部分"
        heading followed by a title.
    """
    # Chinese and Arabic chapter numerals, including ten and above.
    chapter_no = r'[一二三四五六七八九十百千0-9]+'
    # Characters allowed in a heading title: CJK, the enumeration comma and
    # both full-width and ASCII parentheses.
    title_char = r'[\u4e00-\u9fff、()()]'

    begin_pattern = re.compile(
        r'第' + chapter_no + r'(?:章|部分)\s*'  # "第X章" or "第X部分"
        + title_char + r'*?'  # optional leading title text
        + r'(?:(?:服务|项目|商务|技术)' + title_char + r'*?要求'  # "...要求" headings
        + r'|采购|需求)'  # or a procurement / demand heading
        + title_char + r'*?\s*$',  # rest of the title up to the line end
        re.MULTILINE
    )
    end_pattern = re.compile(
        r'第' + chapter_no + r'(?:章|部分)\s*' + title_char + r'+\s*$',
        re.MULTILINE
    )
    return begin_pattern, end_pattern
|
||||||
|
|
||||||
|
|
||||||
@ -168,13 +172,13 @@ def get_patterns_for_evaluation_method():
|
|||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||||
r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号)
|
r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号)
|
||||||
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
|
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。
|
||||||
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||||
r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾
|
r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
return begin_pattern, end_pattern
|
return begin_pattern, end_pattern
|
||||||
|
|
||||||
def get_patterns_for_qualification():
|
def get_patterns_for_qualification():
|
||||||
@ -381,7 +385,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
new_end_pattern = re.compile(
|
new_end_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
|
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
print("第三次尝试 tobidders_notice! ")
|
print("第三次尝试 tobidders_notice! ")
|
||||||
@ -446,10 +450,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
|
|
||||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',re.MULTILINE # 捕获中文部分
|
||||||
)
|
)
|
||||||
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
||||||
|
|
||||||
@ -654,7 +658,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
|||||||
# 根据选择设置对应的模式和结束模式
|
# 根据选择设置对应的模式和结束模式
|
||||||
if selection == 1:
|
if selection == 1:
|
||||||
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
|
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
|
||||||
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
|
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
local_output_suffix = "notice"
|
local_output_suffix = "notice"
|
||||||
elif selection == 2:
|
elif selection == 2:
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
@ -673,7 +677,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
|||||||
local_output_suffix = "tobidders_notice"
|
local_output_suffix = "tobidders_notice"
|
||||||
elif selection == 5:
|
elif selection == 5:
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*')
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') #包头中有一章'采购相关说明'
|
||||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||||
local_output_suffix = "procurement"
|
local_output_suffix = "procurement"
|
||||||
|
|
||||||
@ -799,7 +803,7 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
|||||||
|
|
||||||
#ztbfile.pdf少资格评审 包头少符合性评审
|
#ztbfile.pdf少资格评审 包头少符合性评审
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
|
@ -89,9 +89,14 @@ def postprocess(data):
|
|||||||
# 递归处理顶层数据
|
# 递归处理顶层数据
|
||||||
return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
|
return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
|
||||||
|
|
||||||
def get_technical_requirements(file_id):
|
def get_technical_requirements(file_id,invalid_path):
|
||||||
|
first_query="该文档中是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'"
|
||||||
|
judge_res=qianwen_long(file_id,first_query)
|
||||||
|
print(judge_res)
|
||||||
|
if '否' in judge_res:
|
||||||
|
file_id=upload_file(invalid_path)
|
||||||
user_query1 = """
|
user_query1 = """
|
||||||
这是一份货物标中采购要求部分的内容,请告诉我需要采购的货物,如果有采购清单,请直接根据清单上的货物(或系统)名称给出结果,若没有采购清单,你要从表格中或文中摘取需要采购的系统(或货物),采购需求中可能包含层次关系,例如采购的某大系统中可能包含几种货物,那么你需要用嵌套键值对表示这种关系,且不要遗漏该系统中包含的货物,你的输出请以json格式返回,最外层键名为'采购需求',嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填'未知'。以下为示例输出:
|
请你首先定位该采购文件中的采购清单或采购需求部分,请告诉我需要采购的货物,如果有采购清单,请直接根据清单上的货物(或系统)名称给出结果,若没有采购清单,你要从表格中或文中摘取需要采购的系统(或货物),采购需求中可能包含层次关系,例如采购的某大系统中可能包含几种货物,那么你需要用嵌套键值对表示这种关系,且不要遗漏该系统中包含的货物,你的输出请以json格式返回,最外层键名为'采购需求',嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填'未知'。以下为示例输出:
|
||||||
{
|
{
|
||||||
"采购需求": {
|
"采购需求": {
|
||||||
"门禁管理系统": {},
|
"门禁管理系统": {},
|
||||||
@ -105,9 +110,8 @@ def get_technical_requirements(file_id):
|
|||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
res = qianwen_long(file_id, user_query1)
|
res = qianwen_long(file_id, user_query1)
|
||||||
# print(res)
|
print(res)
|
||||||
cleaned_res = clean_json_string(res) #转字典
|
cleaned_res = clean_json_string(res) #转字典
|
||||||
# print(res)
|
|
||||||
keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单
|
keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单
|
||||||
if '采购需求' in cleaned_res:
|
if '采购需求' in cleaned_res:
|
||||||
cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
|
cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
|
||||||
@ -172,14 +176,17 @@ def test_all_files_in_folder(input_folder, output_folder):
|
|||||||
print(f"处理文件 {file_path} 时出错: {e}")
|
print(f"处理文件 {file_path} 时出错: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
|
# # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
|
||||||
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
|
# # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(107国道).docx"
|
||||||
truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
|
# invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
|
# truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
|
||||||
file_id = upload_file(truncate_file)
|
# output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
|
||||||
res=get_technical_requirements(file_id)
|
# file_id = upload_file(truncate_file)
|
||||||
|
invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
|
||||||
|
file_id="file-fe-FcOjv4FiOGjHRG1pKaFrIBeG"
|
||||||
|
res=get_technical_requirements(file_id,invalid_path)
|
||||||
json_string = json.dumps(res, ensure_ascii=False, indent=4)
|
json_string = json.dumps(res, ensure_ascii=False, indent=4)
|
||||||
print(json_string)
|
print(json_string)
|
||||||
# input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
|
# # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
# # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
||||||
# test_all_files_in_folder(input_folder, output_folder)
|
# # test_all_files_in_folder(input_folder, output_folder)
|
@ -7,7 +7,7 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ
|
|||||||
|
|
||||||
|
|
||||||
#获取采购清单
|
#获取采购清单
|
||||||
def fetch_procurement_reqs(procurement_path,procurement_docpath):
|
def fetch_procurement_reqs(procurement_path,procurement_docpath,invalid_path):
|
||||||
# 定义默认的 procurement_reqs 字典
|
# 定义默认的 procurement_reqs 字典
|
||||||
DEFAULT_PROCUREMENT_REQS = {
|
DEFAULT_PROCUREMENT_REQS = {
|
||||||
"技术要求": "",
|
"技术要求": "",
|
||||||
@ -25,8 +25,8 @@ def fetch_procurement_reqs(procurement_path,procurement_docpath):
|
|||||||
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
# 提交任务给线程池
|
# 提交任务给线程池
|
||||||
future_technical = executor.submit(get_technical_requirements, file_id)
|
future_technical = executor.submit(get_technical_requirements, file_id,invalid_path)
|
||||||
time.sleep(1) # 如果需要延迟,可以保留,否则建议移除以提高效率
|
time.sleep(0.5)
|
||||||
future_business = executor.submit(get_business_requirements, procurement_path, file_id)
|
future_business = executor.submit(get_business_requirements, procurement_path, file_id)
|
||||||
|
|
||||||
# 获取并行任务的结果
|
# 获取并行任务的结果
|
||||||
|
@ -90,7 +90,7 @@ def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path,
|
|||||||
merged_baseinfo_path = invalid_path
|
merged_baseinfo_path = invalid_path
|
||||||
if not procurement_docpath:
|
if not procurement_docpath:
|
||||||
procurement_docpath=invalid_docpath
|
procurement_docpath=invalid_docpath
|
||||||
basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path)
|
basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path,invalid_path)
|
||||||
base_info, good_list = post_process_baseinfo(basic_res)
|
base_info, good_list = post_process_baseinfo(basic_res)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒")
|
logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user