10.18小解析

This commit is contained in:
zy123 2024-10-18 18:06:23 +08:00
parent d06d6a145f
commit 861a54a378
9 changed files with 135 additions and 42 deletions

View File

@ -48,6 +48,15 @@ def pdf2docx(local_path_in):
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}") print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath return downloaded_filepath
def doc2docx(local_path_in):
    """Send a local file to the remote ``d2d`` transfer endpoint and fetch the result.

    The file is uploaded with ``upload_file``; the URL it returns is then
    downloaded next to the input file (same folder, same base name as
    reported by ``get_filename_and_folder``).

    Args:
        local_path_in: Path of the local file to convert.

    Returns:
        The path of the downloaded file, as returned by ``download_file``.
    """
    endpoint = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    download_url = upload_file(local_path_in, endpoint)
    print(download_url)

    # Input and output share one folder; the output reuses the input's name.
    base_name, parent_dir = get_filename_and_folder(local_path_in)
    target_path = os.path.join(parent_dir, base_name)

    downloaded_filepath, _file_type = download_file(download_url, target_path)
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def docx2pdf(local_path_in): def docx2pdf(local_path_in):
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p' remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
receive_download_url = upload_file(local_path_in, remote_url) receive_download_url = upload_file(local_path_in, remote_url)
@ -60,8 +69,8 @@ def docx2pdf(local_path_in):
if __name__ == '__main__': if __name__ == '__main__':
# 替换为你的文件路径和API URL # 替换为你的文件路径和API URL
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).doc" local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).doc"
# pdf2docx(local_path_in) downloaded_file=doc2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in) # downloaded_file=docx2pdf(local_path_in)
print(downloaded_file) print(downloaded_file)

View File

@ -1,22 +1,51 @@
import json import json
import re import re
def extract_content_from_json(string):
    """Extract the ``{...}`` span from *string* and parse it as JSON.

    The span runs from the first ``{`` to the last ``}`` in the input (the
    regex is greedy). If that text does not parse as-is, a single repair pass
    with ``insert_missing_commas`` is attempted before giving up.

    Args:
        string: Text that may embed a JSON object. ``None`` and blank
            strings are accepted and yield an empty dict.

    Returns:
        The parsed JSON value (normally a dict). An empty dict is returned
        when the input is empty, contains no ``{...}`` span, or still fails
        to parse after the repair attempt.
    """
    # Guard against None as well as blank text so callers that forward a
    # missing response do not crash on ``.strip()``.
    if not string or not string.strip():
        return {}

    # Greedy on purpose: capture everything between the outermost braces.
    match = re.search(r'\{[\s\S]*\}', string)
    if not match:
        print("json_utils: extract_content_from_json: 未找到有效的 JSON 内容。")
        return {}

    json_data = match.group(0)

    # First attempt: parse the extracted text unchanged.
    try:
        return json.loads(json_data)
    except json.JSONDecodeError as original_error:
        print(f"原始 JSON 解析失败: {original_error}")

    # Second attempt: repair missing commas between string values and keys.
    try:
        fixed_json = insert_missing_commas(json_data)
        return json.loads(fixed_json)
    except json.JSONDecodeError as fixed_error:
        print(f"修复后的 JSON 解析失败: {fixed_error}")
        return {}
def insert_missing_commas(json_str):
    """Insert a comma between a string value and the next key when it is missing.

    The pattern looks for ``": "value"`` followed only by whitespace and the
    opening quote of the next key, and rewrites the gap as ``", "``. Only
    string values are handled; numbers, objects and arrays are left alone.

    Escaped characters inside the value (for example ``\\"``) are consumed as
    a unit, so an escaped quote is never mistaken for the end of the value.

    Args:
        json_str: JSON text that may lack commas between members.

    Returns:
        The text with the missing commas inserted. Whitespace that separated
        the value from the next key is replaced by ``", "``.
    """
    # Value body: any run of non-quote, non-backslash characters or
    # backslash-escaped pairs, which keeps \" inside the value.
    pattern = re.compile(r'(":\s*"(?:[^"\\]|\\[\s\S])*)"\s*(")')
    replacement = r'\1", \2'

    # Repeat until a pass changes nothing, so every gap ends up repaired.
    previous_str = None
    while previous_str != json_str:
        previous_str = json_str
        json_str = pattern.sub(replacement, json_str)
    return json_str
def clean_json_string(json_string):
    """Return the JSON object embedded in *json_string* as a dict.

    Thin wrapper that delegates entirely to ``extract_content_from_json``.
    """
    parsed = extract_content_from_json(json_string)
    return parsed

View File

@ -43,20 +43,30 @@ def aggregate_basic_info_engineering(baseinfo_list):
# 合并所有基础信息并收集相关键 # 合并所有基础信息并收集相关键
for baseinfo in baseinfo_list: for baseinfo in baseinfo_list:
# json_data = clean_json_string(baseinfo)
combined_data.update(baseinfo) combined_data.update(baseinfo)
relevant_keys_detected.update(baseinfo.keys()) relevant_keys_detected.update(baseinfo.keys())
# 动态调整键组 # 动态调整键组
dynamic_key_handling(key_groups, relevant_keys_detected) dynamic_key_handling(key_groups, relevant_keys_detected)
# 创建一个副本以存储未分类的项目
unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]}
# 按键组分类并嵌套 # 按键组分类并嵌套
for group_name, keys in key_groups.items(): for group_name, keys in key_groups.items():
group_data = {key: combined_data.get(key, "未提供") for key in keys} group_data = {key: combined_data.get(key, "未提供") for key in keys}
combined_data[group_name] = group_data combined_data[group_name] = group_data
# Optionally remove original keys to avoid duplication # 从 unclassified_items 中移除已分类的键
for key in keys: for key in keys:
combined_data.pop(key, None) unclassified_items.pop(key, None)
# 将剩余未分类的键值对添加到 "其他信息" 组
combined_data["其他信息"].update(unclassified_items)
# 移除顶层的未分类键值对
for key in list(combined_data.keys()):
if key not in key_groups:
del combined_data[key]
return combined_data return combined_data

View File

@ -43,20 +43,30 @@ def aggregate_basic_info_engineering(baseinfo_list):
# 合并所有基础信息并收集相关键 # 合并所有基础信息并收集相关键
for baseinfo in baseinfo_list: for baseinfo in baseinfo_list:
# json_data = clean_json_string(baseinfo)
combined_data.update(baseinfo) combined_data.update(baseinfo)
relevant_keys_detected.update(baseinfo.keys()) relevant_keys_detected.update(baseinfo.keys())
# 动态调整键组 # 动态调整键组
dynamic_key_handling(key_groups, relevant_keys_detected) dynamic_key_handling(key_groups, relevant_keys_detected)
# 创建一个副本以存储未分类的项目
unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]}
# 按键组分类并嵌套 # 按键组分类并嵌套
for group_name, keys in key_groups.items(): for group_name, keys in key_groups.items():
group_data = {key: combined_data.get(key, "未提供") for key in keys} group_data = {key: combined_data.get(key, "未提供") for key in keys}
combined_data[group_name] = group_data combined_data[group_name] = group_data
# Optionally remove original keys to avoid duplication # 从 unclassified_items 中移除已分类的键
for key in keys: for key in keys:
combined_data.pop(key, None) unclassified_items.pop(key, None)
# 将剩余未分类的键值对添加到 "其他信息" 组
combined_data["其他信息"].update(unclassified_items)
# 移除顶层的未分类键值对
for key in list(combined_data.keys()):
if key not in key_groups:
del combined_data[key]
return combined_data return combined_data
@ -113,27 +123,22 @@ def combine_basic_info(merged_baseinfo_path,truncate0, output_folder, clause_pat
返回 返回
- dict: 综合后的基础信息 - dict: 综合后的基础信息
""" """
baseinfo_list = [] # baseinfo_prompt_file_path='flask_app/static/提示词/基本信息工程标qianwen-long.txt'
baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt' baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标qianwen-long.txt'
# baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt' questions = read_questions_from_file(baseinfo_prompt_file_path)
questions = read_questions_from_file(baseinfo_file_path) file_id = upload_file(merged_baseinfo_path)
res1 = multi_threading(questions, knowledge_name) baseinfo_results = multi_threading(questions, "", file_id, 2)
# 清理 JSON 字符串
for index, response in res1: baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
try: for i in baseinfo_list:
if response and len(response) > 1: print(json.dumps(i,ensure_ascii=False,indent=4))
baseinfo_list.append(clean_json_string(response[1]))
else:
print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.")
except Exception as e:
print(f"基础信息整合: Error processing response for query index {index}: {e}")
# 判断是否分包、是否需要递交投标保证金等 # 判断是否分包、是否需要递交投标保证金等
chosen_numbers, merged = judge_whether_main(truncate0, output_folder) chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
baseinfo_list.append(merged) baseinfo_list.append(merged)
judge_file_path = 'flask_app/static/提示词/是否相关问题.txt' # judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
# judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt' judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt'
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标 judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标
@ -159,11 +164,12 @@ def combine_basic_info(merged_baseinfo_path,truncate0, output_folder, clause_pat
aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list) # 现在是一个字典 aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list) # 现在是一个字典
return {"基础信息": aggregated_baseinfo} return {"基础信息": aggregated_baseinfo}
#TODO:先不带投标人须知正文,如果是未知,再直接问正文,
if __name__ == "__main__": if __name__ == "__main__":
merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_merged_baseinfo.pdf" merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_merged_baseinfo.pdf"
output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405" output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf" truncate0="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice_table.pdf"
clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json" clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\clause1.json"
res=combine_basic_info(merged_baseinfo_path,truncate0,output_folder,clause_path) res=combine_basic_info(merged_baseinfo_path,truncate0,output_folder,clause_path)
print(json.dumps(res,ensure_ascii=False,indent=4)) print(json.dumps(res,ensure_ascii=False,indent=4))

View File

@ -230,14 +230,14 @@ def convert_clause_to_json(input_path,output_folder,type=1):
# json.dump(processed_data, file, ensure_ascii=False, indent=4) # json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__": if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标01' file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf' # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf'
# start_word = "投标人须知正文" # start_word = "投标人须知正文"
# end_phrases = [ # end_phrases = [
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', # r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:', # r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
# ] # ]
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\test3\\tmp' output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output'
try: try:
output_path = convert_clause_to_json(file_path,output_folder) output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}") print(f"Final JSON result saved to: {output_path}")

View File

@ -42,8 +42,8 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
logger.error("Unsupported file type provided. Preprocessing halted.") logger.error("Unsupported file type provided. Preprocessing halted.")
return None return None
# 异步上传知识库 # # 异步上传知识库
future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id) # future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)
# 调用截取PDF多次 # 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder) truncate_files = truncate_pdf_multiple(pdf_path, output_folder)
@ -55,6 +55,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
truncate0 = truncate_files[0] truncate0 = truncate_files[0]
truncate1 = truncate_files[1] truncate1 = truncate_files[1]
truncate3 = truncate_files[3] truncate3 = truncate_files[3]
merged_baseinfo_path=truncate_files[-1]
clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json
logger.info("文件预处理done") logger.info("文件预处理done")
@ -66,8 +67,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
'truncate0': truncate0, 'truncate0': truncate0,
'truncate1': truncate1, 'truncate1': truncate1,
'truncate3': truncate3, 'truncate3': truncate3,
'knowledge_future': future_knowledge, # 返回 Future 对象 # 'knowledge_future': future_knowledge, # 返回 Future 对象
'truncate0_jsonpath': truncate_jsonpath, 'truncate0_jsonpath': truncate_jsonpath,
'merged_baseinfo_path':merged_baseinfo_path,
'clause_path': clause_path, 'clause_path': clause_path,
'invalid_docpath': invalid_docpath 'invalid_docpath': invalid_docpath
} }

View File

@ -2,7 +2,7 @@
2.该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'项目概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。 2.该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'项目概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
3.该招标文件的招标控制价(可指代投标限价、投资概算金额、工程概算金额、合同估算价但非监理费用请按json格式给我提供信息键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。 3.该招标文件的招标控制价(或投标限价或预算金额或合同估算价但非监理费用请按json格式给我提供信息键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。
4.投标文件递交截止日期是递交方式是请按json格式给我提供信息键名分别是'投标文件递交截止日期','投标文件递交方式',若存在未知信息,在对应的键值中填'未知'。 4.投标文件递交截止日期是递交方式是请按json格式给我提供信息键名分别是'投标文件递交截止日期','投标文件递交方式',若存在未知信息,在对应的键值中填'未知'。

View File

@ -0,0 +1,25 @@
1.该招标文件的项目名称或工程名称招标编号或项目编号招标人是招标代理机构是请按json格式给我提供信息键名分别是'项目名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。
2.该招标文件的项目概况或工程概况招标范围是请按json格式给我提供信息键名分别为'项目概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。
3.该招标文件的招标控制价或投标限价或预算金额或合同估算价但非监理费用请按json格式给我提供信息键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。
4.投标文件递交截止日期是递交方式是请按json格式给我提供信息键名分别是'投标文件递交截止日期','投标文件递交方式',若存在未知信息,在对应的键值中填'未知'。
5.招标人和招标代理机构的联系方式是请按json格式给我提供信息键名分别是'招标人联系方式','招标代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
6.该招标文件的评标结果或相关信息公示媒介在哪请按json格式给我提供信息键名是'评标结果公示媒介',若存在未知信息,在对应的键值中填'未知'。
7.该招标文件的投标竞争下浮率是多少请按json格式给我提供信息键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。
8.该项目的投标有效期是多久或自投标截止之日起多久之内有效请按json格式给我提供信息键名是'投标有效期',键值为文中相关表述,不得擅自总结、删减,若存在未知信息,在对应的键值中填'未知'。
9.该招标文件中对投标人准备和参加投标活动发生的费用是如何规定的请以json的格式给我提供信息键名是'投标费用承担',若存在未知信息,在对应的键值中填'未知'。
10.投标人要求澄清招标文件的截止时间是请以json的格式给我提供信息键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。
11.该文档要求扣留的质量保证金百分比是多少请以json格式给我提供信息键名为'质量保证金',如果没有则以'未知'填充。
12.该项目是否接受联合体投标请按json格式给我提供信息键名为'是否接受联合体投标','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。
13.该项目的开标时间或开启时间和开标地点是请按json格式给我提供信息键名为'开标时间'和'开标地点',对于"开标时间",若文中没有明确时间,将其键值设为文中相关表述,若存在未知信息,在对应的键值中填'未知'。

View File

@ -52,13 +52,25 @@ def aggregate_basic_info_goods(baseinfo_list):
# 动态调整键组 # 动态调整键组
dynamic_key_handling(key_groups, relevant_keys_detected) dynamic_key_handling(key_groups, relevant_keys_detected)
# 创建一个副本以存储未分类的项目
unclassified_items = {k: v for k, v in combined_data.items() if
k not in [item for sublist in key_groups.values() for item in sublist]}
# 按键组分类并嵌套 # 按键组分类并嵌套
for group_name, keys in key_groups.items(): for group_name, keys in key_groups.items():
group_data = {key: combined_data.get(key, "未提供") for key in keys} group_data = {key: combined_data.get(key, "未提供") for key in keys}
combined_data[group_name] = group_data combined_data[group_name] = group_data
# Optionally remove original keys to avoid duplication # 从 unclassified_items 中移除已分类的键
for key in keys: for key in keys:
combined_data.pop(key, None) unclassified_items.pop(key, None)
# 将剩余未分类的键值对添加到 "其他信息" 组中
combined_data["其他信息"].update(unclassified_items)
# 移除顶层的未分类键值对
for key in list(combined_data.keys()):
if key not in key_groups:
del combined_data[key]
return combined_data return combined_data