9.6部分文件引用了招标公告中的内容,添加了对二次跳转的处理
This commit is contained in:
parent
3f79900ed4
commit
04b28a6028
@ -1,3 +1,5 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@ -57,6 +59,7 @@ def read_tables_from_docx(file_path):
|
|||||||
|
|
||||||
return table_list
|
return table_list
|
||||||
|
|
||||||
|
|
||||||
def flatten_nested_dicts(d):
|
def flatten_nested_dicts(d):
|
||||||
"""平坦化嵌套字典,以便更简洁地保存为JSON."""
|
"""平坦化嵌套字典,以便更简洁地保存为JSON."""
|
||||||
keys_to_remove = []
|
keys_to_remove = []
|
||||||
@ -77,12 +80,21 @@ def flatten_nested_dicts(d):
|
|||||||
|
|
||||||
return d
|
return d
|
||||||
|
|
||||||
def save_data_to_json(data, filename):
|
|
||||||
"""将数据保存到JSON文件中."""
|
|
||||||
with open(filename, 'w', encoding='utf-8') as file:
|
|
||||||
json.dump(data, file, ensure_ascii=False, indent=4)
|
|
||||||
|
|
||||||
def extract_tables_main(path, output_filename):
|
def save_data_to_json(data, output_folder):
|
||||||
|
filename = "truncate_output.json"
|
||||||
|
output_filepath = os.path.join(output_folder, filename)
|
||||||
|
"""将数据保存到JSON文件中."""
|
||||||
|
with open(output_filepath, 'w', encoding='utf-8') as file:
|
||||||
|
json.dump(data, file, ensure_ascii=False, indent=4)
|
||||||
|
print(f"The data has been processed and saved to '{output_filepath}'.")
|
||||||
|
return output_filepath
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tables_main(path, output_folder):
|
||||||
|
if not os.path.exists(path):
|
||||||
|
print(f"The specified file does not exist: {path}")
|
||||||
|
return ""
|
||||||
# 读取文档表格数据
|
# 读取文档表格数据
|
||||||
table_data = read_tables_from_docx(path)
|
table_data = read_tables_from_docx(path)
|
||||||
|
|
||||||
@ -90,11 +102,10 @@ def extract_tables_main(path, output_filename):
|
|||||||
flattened_data = flatten_nested_dicts(table_data)
|
flattened_data = flatten_nested_dicts(table_data)
|
||||||
|
|
||||||
# 保存平坦化后的数据到JSON文件
|
# 保存平坦化后的数据到JSON文件
|
||||||
save_data_to_json(flattened_data, output_filename)
|
return save_data_to_json(flattened_data, output_folder)
|
||||||
|
|
||||||
print(f"The data has been processed and saved to '{output_filename}'.")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\zbtest20_17-22.docx'
|
path = ''
|
||||||
output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\truncate_output.json" # 前附表json文件
|
output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\truncate_output.json" # 前附表json文件
|
||||||
extract_tables_main(path, output_filename)
|
extract_tables_main(path, output_filename)
|
||||||
|
@ -81,8 +81,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
|||||||
# 调用大模型回答项目基础信息
|
# 调用大模型回答项目基础信息
|
||||||
print("starting基础信息...")
|
print("starting基础信息...")
|
||||||
baseinfo_list = []
|
baseinfo_list = []
|
||||||
baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
# baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
||||||
# baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
||||||
questions = read_questions_from_file(baseinfo_file_path)
|
questions = read_questions_from_file(baseinfo_file_path)
|
||||||
res1 = multi_threading(questions, knowledge_name)
|
res1 = multi_threading(questions, knowledge_name)
|
||||||
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
||||||
@ -97,11 +97,10 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
|||||||
# 判断是否分包、是否需要递交投标保证金等
|
# 判断是否分包、是否需要递交投标保证金等
|
||||||
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
||||||
baseinfo_list.append(merged)
|
baseinfo_list.append(merged)
|
||||||
judge_file_path = '../static/提示词/是否相关问题.txt'
|
# judge_file_path = '../static/提示词/是否相关问题.txt'
|
||||||
# judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
||||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||||
|
|
||||||
|
|
||||||
judge_consortium = judge_consortium_bidding(baseinfo_list) #通过招标公告判断是否接受联合体投标
|
judge_consortium = judge_consortium_bidding(baseinfo_list) #通过招标公告判断是否接受联合体投标
|
||||||
if judge_consortium:
|
if judge_consortium:
|
||||||
judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
|
judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
|
||||||
|
@ -125,7 +125,6 @@ def extract_matching_keys(json_data):
|
|||||||
|
|
||||||
return final_matching
|
return final_matching
|
||||||
|
|
||||||
#TODO:如果要引用到招标公告中的内容,考虑提取 或者qianwen-long
|
|
||||||
def reformat_questions(match_keys, input_path, output_folder):
|
def reformat_questions(match_keys, input_path, output_folder):
|
||||||
entries_with_numbers = []
|
entries_with_numbers = []
|
||||||
entries_with_numbers2 = [] # 用于保存特别处理的条目
|
entries_with_numbers2 = [] # 用于保存特别处理的条目
|
||||||
@ -176,11 +175,11 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
|
|||||||
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
|
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
|
||||||
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
|
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
|
||||||
clause_json_path) # 调用根据条款号整合json.py
|
clause_json_path) # 调用根据条款号整合json.py
|
||||||
combined_results1, formatted_questions2 = judge_second_jump(combined_results)
|
combined_results1, formatted_questions2 = judge_second_jump(combined_results) #判断是否需要二次跳转
|
||||||
if entries_with_numbers2: #跳转第一章招标公告
|
if entries_with_numbers2: #跳转第一章招标公告
|
||||||
combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
|
combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
|
||||||
|
|
||||||
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name, True) #无序号的直接问大模型
|
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name) #无序号的直接问大模型
|
||||||
first_response_list = []
|
first_response_list = []
|
||||||
for _, response in results_2:
|
for _, response in results_2:
|
||||||
try:
|
try:
|
||||||
|
@ -152,7 +152,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
# 确保找到了起始和结束页面
|
# 确保找到了起始和结束页面
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
if output_suffix == "qualification" or output_suffix =="invalid":
|
if output_suffix == "qualification" or output_suffix =="invalid":
|
||||||
extract_pages_twice(pdf_path, output_folder, output_suffix)
|
return extract_pages_twice(pdf_path, output_folder, output_suffix)
|
||||||
else:
|
else:
|
||||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
@ -249,9 +249,9 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
|
|
||||||
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest18.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest20.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
||||||
# truncate_pdf_multiple(input_path,output_folder)
|
# truncate_pdf_multiple(input_path,output_folder)
|
||||||
selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
selection = 4 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# # print("生成的文件:", generated_files)
|
# # print("生成的文件:", generated_files)
|
||||||
|
@ -94,7 +94,7 @@ def transform_json(data):
|
|||||||
|
|
||||||
|
|
||||||
# 读取JSON数据,提取内容,转换结构,并打印结果
|
# 读取JSON数据,提取内容,转换结构,并打印结果
|
||||||
def extract_from_notice(file_path, type):
|
def extract_from_notice(clause_path, type):
|
||||||
if type == 1:
|
if type == 1:
|
||||||
target_values = ["投标文件", "投标"]
|
target_values = ["投标文件", "投标"]
|
||||||
elif type == 2:
|
elif type == 2:
|
||||||
@ -103,7 +103,7 @@ def extract_from_notice(file_path, type):
|
|||||||
target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
|
target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid type specified. Use 1 for '开标, 评标, 定标' or 2 for '投标文件, 投标'.")
|
raise ValueError("Invalid type specified. Use 1 for '开标, 评标, 定标' or 2 for '投标文件, 投标'.")
|
||||||
with open(file_path, 'r', encoding='utf-8') as file:
|
with open(clause_path, 'r', encoding='utf-8') as file:
|
||||||
data = json.load(file)
|
data = json.load(file)
|
||||||
extracted_data = extract_json(data, target_values) # 读取json
|
extracted_data = extract_json(data, target_values) # 读取json
|
||||||
transformed_data = transform_json(extracted_data)
|
transformed_data = transform_json(extracted_data)
|
||||||
|
@ -125,6 +125,9 @@ def convert_to_json(file_path, start_word, end_phrases):
|
|||||||
return parsed_data
|
return parsed_data
|
||||||
|
|
||||||
def convert_clause_to_json(input_path,output_folder,type=1):
|
def convert_clause_to_json(input_path,output_folder,type=1):
|
||||||
|
if not os.path.exists(input_path):
|
||||||
|
print(f"The specified file does not exist: {input_path}")
|
||||||
|
return ""
|
||||||
if type==1:
|
if type==1:
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
|
@ -43,15 +43,15 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
|
|||||||
index = addfileToKnowledge(docx_path, knowledge_name)
|
index = addfileToKnowledge(docx_path, knowledge_name)
|
||||||
|
|
||||||
# 调用截取PDF多次
|
# 调用截取PDF多次
|
||||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder) # [前附表, 评标办法, 须知正文, 资格审查条件,无效标]
|
truncate_files = truncate_pdf_multiple(pdf_path, output_folder) # [前附表, 评标办法, 须知正文, 资格审查条件]
|
||||||
|
print(truncate_files)
|
||||||
|
|
||||||
# 处理各个部分
|
# 处理各个部分
|
||||||
truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx
|
truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx
|
||||||
|
|
||||||
invalid_docpath = copy_docx(docx_path) #docx截取无效标部分
|
invalid_docpath = copy_docx(docx_path) #docx截取无效标部分
|
||||||
|
|
||||||
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
truncate_jsonpath=extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json,从表格提取数据
|
||||||
extract_tables_main(truncate0_docpath, truncate0_jsonpath) # 投标人须知前附表docx->json,从表格提取数据
|
|
||||||
truncate0 = truncate_files[0] #投标人须知前附表
|
truncate0 = truncate_files[0] #投标人须知前附表
|
||||||
truncate1 = truncate_files[1] #评标办法前附表
|
truncate1 = truncate_files[1] #评标办法前附表
|
||||||
truncate3 = truncate_files[3] #资格审查表
|
truncate3 = truncate_files[3] #资格审查表
|
||||||
@ -65,7 +65,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
|
|||||||
'truncate3': truncate3,
|
'truncate3': truncate3,
|
||||||
'knowledge_index': index,
|
'knowledge_index': index,
|
||||||
'knowledge_name': knowledge_name,
|
'knowledge_name': knowledge_name,
|
||||||
'truncate0_jsonpath': truncate0_jsonpath,
|
'truncate0_jsonpath': truncate_jsonpath,
|
||||||
'clause_path': clause_path,
|
'clause_path': clause_path,
|
||||||
'invalid_docpath': invalid_docpath
|
'invalid_docpath': invalid_docpath
|
||||||
}
|
}
|
||||||
@ -215,7 +215,6 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
|
|||||||
#
|
#
|
||||||
# deleteKnowledge(processed_data['knowledge_index'])
|
# deleteKnowledge(processed_data['knowledge_index'])
|
||||||
|
|
||||||
# TODO:如果上传的是pdf转过的docx文件,那么提取打勾符号就会有问题 zbtest20 跳转涉及二级跳转 对于跳转到第一章 招标公告的要做额外处理 资格审查位置在第一章后面。如果未截取成功,需要作额外处理 logger不能保存控制台输出
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3"
|
||||||
|
|
||||||
@ -227,7 +226,9 @@ if __name__ == "__main__":
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
file_type = 1 #1:docx 2:pdf 3:其他
|
file_type = 1 #1:docx 2:pdf 3:其他
|
||||||
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3\\zbtest20.docx"
|
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\test3\\zbtest20.docx"
|
||||||
file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
|
# file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
|
||||||
|
|
||||||
|
preprocess_files(output_folder, input_file, file_type, "unique_id")
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
elapsed_time = end_time - start_time # 计算耗时
|
elapsed_time = end_time - start_time # 计算耗时
|
||||||
print(f"Function execution took {elapsed_time} seconds.")
|
print(f"Function execution took {elapsed_time} seconds.")
|
||||||
|
@ -78,8 +78,8 @@ def get_consortium_dict(knowledge_name):
|
|||||||
return consortium_dict
|
return consortium_dict
|
||||||
|
|
||||||
def get_all_dict(knowledge_name):
|
def get_all_dict(knowledge_name):
|
||||||
qualification_review_file_path = '../static/提示词/资格评审.txt' # 替换为你的txt文件路径
|
# qualification_review_file_path = '../static/提示词/资格评审.txt' # 替换为你的txt文件路径
|
||||||
# # qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
|
qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
|
||||||
questions = read_questions_from_file(qualification_review_file_path)
|
questions = read_questions_from_file(qualification_review_file_path)
|
||||||
qualification_list = []
|
qualification_list = []
|
||||||
res1 = multi_threading(questions, knowledge_name)
|
res1 = multi_threading(questions, knowledge_name)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user