# zbparse/flask_app/main/无效标和废标和禁止投标整合.py

# -*- coding: utf-8 -*-
import json
import os.path
import time
import re
from json_utils import combine_json_results, nest_json_under_key
from 通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from 禁止投标情形 import find_forbidden


# If the current paragraph carries a section number, collect forward until a heading
# with the same numbering style appears; if it carries none, look ahead for the first
# numbered paragraph and collect every sibling that shares its numbering pattern.
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    from collections import OrderedDict
    from docx import Document
    import re

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()  # OrderedDict keeps document order
    continue_collecting = False
    current_section_pattern = None
    active_key = None  # key of the text block currently being collected

    def match_keywords(text, patterns):
        """Return True if any of the regex patterns matches the text."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":  # skip empty lines
            return
        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None  # a same-level heading ends the current collection
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
        if match_keywords(text, keywords):
            active_key = text  # the matching paragraph becomes the block key
            extracted_paragraphs[active_key] = [text]  # the list starts with the match itself
            # does the paragraph also announce an enumeration ("之一", "下列", ...)?
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # derive a pattern for the heading level of the current paragraph
                section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')
                    # e.g. "7.2.1" (two dots) yields r'\b\d+\s*\.\s*\d+\s*\.\s*\d+\b'
                    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
                    current_section_pattern = re.compile(pattern)
                else:
                    # no number on this paragraph: look ahead for the first numbered
                    # paragraph and collect every sibling that shares its pattern
                    found_next_number = False
                    current_section_pattern = None
                    for next_index in range(index + 1, len(doc.paragraphs)):
                        next_text = doc.paragraphs[next_index].text.strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                section_parts = next_section_number.group(1).split('.')
                                dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)  # keep collecting siblings

    for index, para in enumerate(doc.paragraphs):
        extract_from_text(para.text.strip(), index)
    return extracted_paragraphs  # dict: keyword paragraph -> list of related paragraphs
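
# A minimal sketch (illustrative only, not called anywhere in this module) of the
# numbering heuristic above: a heading such as "7.2.1" carries two dots, so the
# derived pattern matches any heading of the same depth and ends the collection there.
def _demo_section_pattern():
    text = "7.2.1 有下列情形之一的,投标无效:"
    section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
    level_count = section_number.group(1).count('.')  # 2 -> a three-level heading
    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
    assert re.match(pattern, "7.2.2 下一条")        # same-depth sibling: collection stops
    assert not re.match(pattern, "7.2 上一级标题")  # shallower heading: keep collecting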
def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalize the keyword-extracted blocks into clean text."""
    all_texts1 = []
    all_texts2 = []
    # sentence-splitting pattern covering Chinese and Western terminal punctuation
    split_pattern = r'(?<=[。!?\!\?])'
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # drop entries that contain any excluded string
                if any(exclude in data for exclude in excludes):
                    continue
                # strip the leading numbering: "(1)", "A1.2", "3、", ...
                pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # from the keyword position, keep only up to the next terminal punctuation
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        cleaned_text = data[:start_pos] + sentences[0]  # first sentence, punctuation kept
                    else:
                        cleaned_text = data  # no terminal punctuation: keep the whole string
                else:
                    cleaned_text = data  # keyword not found: keep the original text
                all_texts1.append(cleaned_text)
        else:
            # multi-paragraph block: strip the numbering off the first line, then join
            pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
            data = re.sub(pattern, '', text_list[0]).strip()
            text_list[0] = data
            joined_text = "\n".join(text_list)
            all_texts2.append(joined_text)
    return all_texts1, all_texts2
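
# Illustrative only: how the leading-numbering regex used in clean_dict_datas behaves
# on a few assumed heading shapes (made-up strings, not from a real tender file).
def _demo_strip_numbering():
    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
    for raw in ("(1)投标文件未密封的", "3.2.1 投标无效", "A1、未按要求盖章的"):
        print(re.sub(pattern, '', raw).strip())
    # -> 投标文件未密封的 / 投标无效 / 未按要求盖章的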
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively collect sentences containing the keywords; sentences that also
    carry a follow-up cue go into the second list, the rest into the first."""
    sentences1 = []  # without a follow-up keyword
    sentences2 = []  # with a follow-up keyword
    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # split into sentences, keeping the terminal punctuation
        split_sentences = re.split(r'(?<=[。!?\!\?])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i]
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # the sentence announces an enumeration: capture up to the next numbered section
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break
                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', full_text)
                    sentences2.append(data)  # enumeration case
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', sentence).replace('\n', '').strip()
                    sentences1.append(data)  # single-sentence case
                    i += 1
            else:
                i += 1
    return sentences1, sentences2
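
# A small hypothetical example of the recursive walk above: keyword sentences inside
# nested dict/list values are found, and a sentence that announces an enumeration
# ("…之一") lands in the second list rather than the first.
def _demo_find_sentences():
    data = {"第7条": ["7.1 投标文件有下列情形之一的,投标无效:未按要求密封。"],
            "第8条": "8.1 逾期送达的投标文件,招标人将予以拒绝。"}
    s1, s2 = find_sentences_with_keywords(data, r'无\s*效|拒\s*绝', [r'之\s*一'])
    print(s1)  # ['逾期送达的投标文件,招标人将予以拒绝。']
    print(s2)  # ['投标文件有下列情形之一的,投标无效:未按要求密封。']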
def extract_sentences_from_json(json_path, keywords, follow_up_keywords):
    """Extract keyword sentences from a JSON file."""
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return find_sentences_with_keywords(data, keywords, follow_up_keywords)
# invalid-bid handling
def extract_values_if_contains(data, includes):
    """
    Recursively check whether values in the data contain any string from 'includes'.
    Matching values are collected into a list and returned.

    Args:
        data (dict): dict or JSON-parsed data.
        includes (list): keywords to look for.

    Returns:
        list: values that contain at least one keyword.
    """
    included_values = []  # result list

    # recursive helper for nested dicts and lists
    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                if isinstance(value, dict):
                    # dict value: recurse into it
                    recursive_search(value)
                elif isinstance(value, list):
                    # list value: recurse into each element
                    recursive_search(value)
                elif isinstance(value, str):
                    # string value: check it against the keyword list
                    if any(include in value for include in includes):
                        included_values.append(value)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item)

    recursive_search(data)
    return included_values
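
# Hypothetical usage of extract_values_if_contains with made-up data: only string
# values containing one of the keywords survive the walk.
def _demo_extract_values():
    data = {"a": {"b": "投标文件逾期送达的,予以拒绝"}, "c": ["无关内容"]}
    print(extract_values_if_contains(data, ["拒绝"]))  # ['投标文件逾期送达的,予以拒绝']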
# Earlier prompt drafts, kept for reference:
# 你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
# 以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。
# 以上是原文内容,文本内的信息以'...............'分割请你根据该信息回答否决投标或拒绝投标或无效投标或使投标失效的情况有哪些文本中可能存在无关的信息请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
    follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
    extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # OrderedDict
    all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
    all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
    qianwen_txt = all_texts1 + all_tables1
    # proceed only if there is content to write
    if qianwen_txt:
        with open(output_file, 'w', encoding='utf-8') as file:
            counter = 1
            for content in qianwen_txt:
                file.write("..............." + '\n')
                # prefix each block with its ordinal so the model can cite it back
                file.write(f"{counter}. {content}\n")
                counter += 1
        file_id = upload_file(output_file)
        print("starting qianwen-long...")
        qianwen_ans = qianwen_long(file_id, user_query)
        selected_contents = []
        num_list = json.loads(qianwen_ans)  # expects a bare JSON list such as [1, 3, 5]
        print(num_list)
        for index in num_list:
            if index - 1 < len(qianwen_txt):
                content = qianwen_txt[index - 1]  # ordinals start at 1
                selected_contents.append(content)
        selected_contents += all_texts2
        selected_contents += all_tables2
        res = {result_key: selected_contents}
        # os.remove(output_file)  # remove the temp file after use
        # print(f"Deleted temporary file: {output_file}")
    else:
        res = {result_key: ""}  # empty response when nothing was extracted
    return res
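
# json.loads above assumes the model answers with a bare list such as "[1, 3, 5]".
# A more defensive variant (a sketch, not used by handle_query) could pull the first
# bracketed group out of a chattier answer before parsing:
def _parse_index_list(answer):
    match = re.search(r'\[[\d,\s]*\]', answer)
    return json.loads(match.group(0)) if match else []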
def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate4):
    print("starting无效标与废标...")
    queries = [
        (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
        (r'废\s*标',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp2.txt"), "废标项")
    ]
    results = []
    # run the two keyword queries in parallel
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,
                                     truncate_json_path)
            futures.append(future)
            time.sleep(1)  # stagger submissions by one second
        for future in futures:
            results.append(future.result())
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4)
    results.append(forbidden_res)
    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    print("无效标与废标done...")
    return nest_json_under_key(combined_dict, "无效标与废标项")
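
# Rough shape of the combined result (the first two sub-keys come from the queries
# above; the forbidden-bid key depends on what find_forbidden in 禁止投标情形 returns):
# {"无效标与废标项": {"否决和无效投标情形": [...], "废标项": [...], ...}}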
# TODO: 1. end-to-end runtime is about 80 s; optimize the threading if this becomes the bottleneck
#       2. tables in the front sheet of the bid-evaluation rules are not extracted yet
#       3. table extraction currently splits on the Chinese full stop
#       4. qianwen-long still has a known bug
if __name__ == '__main__':
    start_time = time.time()
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
    results = combine_find_invalid(doc_path, output_dir, truncate_json_path, clause_path, truncate4)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", results)