2024-08-29 16:37:09 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
import json
|
|
|
|
|
import os.path
|
|
|
|
|
import time
|
|
|
|
|
import re
|
2024-11-11 17:12:38 +08:00
|
|
|
|
|
|
|
|
|
from flask_app.general.format_change import pdf2docx
|
2024-11-12 17:12:47 +08:00
|
|
|
|
from flask_app.general.无效标和废标公共代码 import extract_text_with_keywords, preprocess_text_list, clean_dict_datas, \
|
|
|
|
|
read_tables_from_docx, extract_table_with_keywords
|
2024-11-04 10:52:23 +08:00
|
|
|
|
from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text
|
2024-08-29 16:37:09 +08:00
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
2024-11-11 17:12:38 +08:00
|
|
|
|
|
|
|
|
|
from flask_app.main.table_content_extraction import extract_tables_main
|
2024-09-11 12:02:09 +08:00
|
|
|
|
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
|
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
#处理无效投标
|
|
|
|
|
def extract_values_if_contains(data, includes):
    """Collect every string in ``data`` that contains at least one keyword.

    The structure is walked depth-first. Dict values and list items are
    visited in their stored order at any nesting depth, so a list stored
    under a dict key (and the strings or dicts inside it) is searched as
    well. Non-string leaves such as numbers, booleans and ``None`` are
    ignored. Dict keys are never tested, only values.

    Args:
        data: A dict, a list, or any nesting of the two (for example the
            result of ``json.loads``). A bare string is treated as a
            single leaf.
        includes: Keywords to look for. A string is collected when any
            keyword is a substring of it.

    Returns:
        list: The matching strings in traversal order. Duplicates are
        kept. Empty when nothing matches or ``includes`` is empty.
    """
    included_values = []

    def recursive_search(current_data):
        # One visitor for every node type keeps the dict and list
        # branches from drifting apart (the list-inside-dict case used
        # to be skipped because only the dict branch inspected values).
        if isinstance(current_data, dict):
            for value in current_data.values():
                recursive_search(value)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item)
        elif isinstance(current_data, str):
            if any(include in current_data for include in includes):
                included_values.append(current_data)

    recursive_search(data)
    return included_values
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
|
|
|
|
|
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。
|
|
|
|
|
#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",
|
|
|
|
|
|
2024-10-30 16:56:05 +08:00
|
|
|
|
#TODO:truncate_json_path为空的时候,单独提取表格数据
|
2024-11-12 17:12:47 +08:00
|
|
|
|
def handle_query(file_path, user_query, output_file, result_key, keywords):
    """Extract keyword-related passages from a docx and filter them by query.

    Candidate passages come from two sources: body text (via
    ``extract_text_with_keywords`` + ``clean_dict_datas``) and tables (via
    ``read_tables_from_docx`` + ``extract_table_with_keywords``). Each
    helper pair yields two collections:

    * the first ones (``all_texts1`` / ``all_tables1``) are numbered,
      written to ``output_file``, uploaded, and sent with ``user_query``
      to ``qianwen_long_text``; only the entries whose 1-based index is
      returned are kept;
    * the second ones (``all_texts2`` / ``all_tables2``) are always kept,
      whether or not there were any candidates for the first step.

    Results are de-duplicated while preserving first-seen order, so the
    returned list is stable from run to run.

    Args:
        file_path: Path of the document handed to the extraction helpers.
        user_query: Prompt sent together with the uploaded candidate file.
        output_file: Path of the temporary text file holding the numbered
            candidates; it is overwritten.
        result_key: Key used in the returned dict and in log output.
        keywords: Keyword pattern forwarded to the extraction helpers.
            NOTE(review): callers pass a regex string - confirm the
            helpers treat it as such.

    Returns:
        dict: ``{result_key: [passage, ...]}`` when anything was selected,
        otherwise ``{result_key: ""}``. Any exception raised along the way
        is printed and also yields ``{result_key: ""}``.
    """
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"]
        follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']

        # Body text (tables excluded).
        extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)

        # Table content.
        table_data_list = read_tables_from_docx(file_path)
        all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords)

        qianwen_txt = all_texts1 + all_tables1

        # A dict is used as an ordered set: keys are unique and keep their
        # insertion order, whereas iterating a set of strings gives an
        # order that can change between interpreter runs.
        selected_contents = {}

        if qianwen_txt:
            with open(output_file, 'w', encoding='utf-8') as file:
                for counter, content in enumerate(qianwen_txt, start=1):
                    file.write("..............." + '\n')
                    file.write(f"{counter}. {content}\n")

            file_id = upload_file(output_file)
            qianwen_ans = qianwen_long_text(file_id, user_query)
            # presumably parses the "[x,x,x]" reply into a list of ints -
            # verify against process_string_list
            num_list = process_string_list(qianwen_ans)
            print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
                # Ignore indices outside the numbered candidate range.
                if 1 <= index <= len(qianwen_txt):
                    selected_contents[qianwen_txt[index - 1]] = None

        # Always include the second collections, even when there were no
        # candidates to send for filtering.
        selected_contents.update(dict.fromkeys(all_texts2))
        selected_contents.update(dict.fromkeys(all_tables2))

        if selected_contents:
            res = {result_key: list(selected_contents)}
        else:
            res = {result_key: ""}

        return res
    except Exception as e:
        # Best-effort contract: callers always get a dict keyed by result_key.
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}
|
|
|
|
|
|
2024-11-13 09:41:37 +08:00
|
|
|
|
#TODO:增加提取逻辑 else里面的内容
|
2024-11-12 17:12:47 +08:00
|
|
|
|
def combine_find_invalid(file_path, output_folder, qualification):
    """Run the invalid-bid, scrapped-bid and forbidden-situation lookups.

    Two ``handle_query`` jobs are submitted to a thread pool, one per
    keyword family, each with its own prompt and temporary file inside
    ``output_folder``. ``find_forbidden`` is then called on
    ``qualification``. All three result dicts are merged, in that order,
    into a single dict.

    Args:
        file_path: Document path forwarded to ``handle_query``.
        output_folder: Directory for the temporary candidate files; it is
            created if missing.
        qualification: Argument forwarded unchanged to ``find_forbidden``.

    Returns:
        dict: ``{"无效标与废标项": merged}`` where ``merged`` combines the
        three partial result dicts. A failed step contributes its key
        with an empty string instead of raising.
    """
    os.makedirs(output_folder, exist_ok=True)

    invalid_bid_pattern = r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|无\s*效\s*标|视\s*为\s*无\s*效|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效'
    invalid_bid_prompt = "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。"
    scrapped_bid_pattern = r'废\s*标'
    scrapped_bid_prompt = "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。"

    # One entry per job: (keyword pattern, prompt, temp file, result key).
    jobs = [
        (
            invalid_bid_pattern,
            invalid_bid_prompt,
            os.path.join(output_folder, "temp1.txt"),
            "否决和无效投标情形",
        ),
        (
            scrapped_bid_pattern,
            scrapped_bid_prompt,
            os.path.join(output_folder, "temp2.txt"),
            "废标项",
        ),
    ]

    partial_results = []
    with ThreadPoolExecutor() as pool:
        submitted = []
        for pattern, prompt, temp_path, key in jobs:
            task = pool.submit(handle_query, file_path, prompt, temp_path, key, pattern)
            # Keep the key next to its future so results stay in job order.
            submitted.append((task, key))
            # Pause before submitting the next job.
            # NOTE(review): presumably to avoid hitting the remote API in
            # a burst - confirm.
            time.sleep(0.5)

        for task, key in submitted:
            try:
                outcome = task.result()
            except Exception as e:
                print(f"线程处理 {key} 时出错: {e}")
                outcome = {key: ""}
            partial_results.append(outcome)

    # Forbidden-situation lookup (find_forbidden).
    try:
        forbidden_res = find_forbidden(qualification)
    except Exception as e:
        print(f"find_forbidden 处理时出错: {e}")
        forbidden_res = {'不得存在的其他情形': ""}
    partial_results.append(forbidden_res)

    # Later dicts win on key clashes, same as successive dict.update calls.
    combined_dict = {}
    for partial in partial_results:
        combined_dict.update(partial)

    return {"无效标与废标项": combined_dict}
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run against a local sample project; the paths below are
    # machine-specific and must be adjusted before running elsewhere.
    start_time = time.time()

    qualification=r"D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\ztbfile_qualification.pdf"
    output_folder = r"D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\tmp"
    file_path = r'D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\ztbfile.docx'

    results = combine_find_invalid(
        file_path,
        output_folder,
        qualification,
    )

    end_time = time.time()
    elapsed = end_time - start_time

    # Report wall-clock time, then the merged result as readable JSON
    # (non-ASCII characters are kept as-is).
    print("Elapsed time:", str(elapsed))
    rendered = json.dumps(results, ensure_ascii=False, indent=4)
    print("Results:", rendered)
|