# -*- coding: utf-8 -*-
import json
import os.path
import time
import re
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list

# If the current paragraph carries a section number, keep collecting until a
# paragraph numbered in the same style is reached.
# If it carries no section number, look ahead for numbered paragraphs and pull
# out every entry that shares the same numbering style.
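# Illustration (hypothetical paragraphs, for intuition only):
#   "3.1 投标无效的情形:" carries the number 3.1, so collection stops at the
#   next paragraph numbered in the same style ("3.2 ...").
#   "下列情形之一的,投标无效:" carries no number, so the style of the first
#   numbered paragraph that follows (e.g. "(1)...") decides what is collected.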
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    from collections import OrderedDict
    from docx import Document
    import re

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()  # OrderedDict preserves insertion order
    continue_collecting = False
    current_section_pattern = None
    active_key = None  # key of the text block currently being collected

    def match_keywords(text, patterns):
        """Return True if any pattern matches the text (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":  # skip empty lines
            return
        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None  # end the current collection
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)

        if match_keywords(text, keywords):
            active_key = text  # the matching paragraph becomes the block key
            extracted_paragraphs[active_key] = [text]  # start the list with the matching paragraph itself
            # Check whether the paragraph also matches a follow-up keyword
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # Enter tracking mode
                section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')
                    # Build a pattern for numbers of the same nesting depth,
                    # e.g. "3.2" (one dot) yields r'\b\d+\s*\.\s*\d+\b'
                    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
                    current_section_pattern = re.compile(pattern)
                else:
                    found_next_number = False
                    current_section_pattern = None

                    for next_index in range(index + 1, len(doc.paragraphs)):
                        next_text = doc.paragraphs[next_index].text.strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                section_parts = next_section_number.group(1).split('.')
                                dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                current_section_pattern = re.compile(dynamic_pattern)

                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)  # keep collecting

    for index, para in enumerate(doc.paragraphs):
        extract_from_text(para.text.strip(), index)

    return extracted_paragraphs  # dict: keyword paragraph -> list of related paragraphs

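# A minimal usage sketch (hypothetical path and patterns, for illustration):
#   contents = extract_text_with_keywords(
#       'invalid.docx',
#       [r'否\s*决|投\s*标\s*无\s*效'],
#       [r'下\s*列', r'情\s*形\s*之\s*一'],
#   )
#   for key, paragraphs in contents.items():
#       print(key, '->', len(paragraphs), 'paragraph(s)')
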
def clean_dict_datas(extracted_contents, keywords, excludes):  # normalize the regex-extracted text
    all_texts1 = []
    all_texts2 = []
    # Sentence-splitting pattern covering Chinese and Western end punctuation
    split_pattern = r'(?<=[。!?\!\?])'

    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Skip entries containing any excluded string
                if any(exclude in data for exclude in excludes):
                    continue
                # Strip the leading enumerator, including letter+digit forms and parenthesized digits
                pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Look for end punctuation starting from the keyword position
                    start_pos = keyword_match.start()
                    # Take the substring from the keyword onwards
                    substring = data[start_pos:]
                    # Split once on the end-punctuation pattern defined above
                    sentences = re.split(split_pattern, substring, maxsplit=1)
                    if len(sentences) > 0 and sentences[0]:
                        # Keep only the first sentence, punctuation included
                        cleaned_text = data[:start_pos] + sentences[0]
                    else:
                        cleaned_text = data  # no end punctuation, keep the whole string
                else:
                    # Keyword not found, keep the original text
                    cleaned_text = data
                all_texts1.append(cleaned_text)  # collect the processed text

        else:
            pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
            data = re.sub(pattern, '', text_list[0]).strip()
            # Join the cleaned first element with the remaining elements
            text_list[0] = data
            joined_text = "\n".join(text_list)
            all_texts2.append(joined_text)

    return all_texts1, all_texts2  # all_texts1 still goes through the LLM; all_texts2 is returned as-is

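# The enumerator pattern above strips labels such as "(1)", "A1、" and "3.2.1 "
# from the start of a line; a quick check (expected results shown as comments):
#   _pat = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
#   re.sub(_pat, '', '(1)投标文件未密封的')  # -> '投标文件未密封的'
#   re.sub(_pat, '', '3.2.1 投标无效')       # -> '投标无效'
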
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively collect sentences containing the keywords, split into two
    lists depending on whether a follow-up keyword is also present."""
    sentences1 = []  # cases without a follow-up keyword
    sentences2 = []  # cases with a follow-up keyword

    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # Split into sentences, keeping each sentence intact
        split_sentences = re.split(r'(?<=[。!?\!\?])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i]
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # A follow-up keyword is present: capture from here up to the next section number
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break

                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', full_text)
                    sentences2.append(data)  # store the follow-up case
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', sentence).replace('\n', '').strip()
                    sentences1.append(data)  # store the no-follow-up case
                    i += 1
            else:
                i += 1

    return sentences1, sentences2  # two lists

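# The lookbehind split keeps the end punctuation attached to each sentence
# (splitting on zero-width matches requires Python 3.7+), e.g.:
#   re.split(r'(?<=[。!?\!\?])', '第一句。第二句!第三句')
#   # -> ['第一句。', '第二句!', '第三句']
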
def extract_sentences_from_json(json_path, keywords, follow_up_keywords):
    """Extract keyword-bearing sentences from the JSON data."""
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return find_sentences_with_keywords(data, keywords, follow_up_keywords)

# Handling invalid bids
def extract_values_if_contains(data, includes):
    """
    Recursively check whether values in the data contain any of the strings
    in 'includes'; matching values are accumulated and returned.

    Args:
        data (dict): a dict, or data parsed from JSON.
        includes (list): the keywords to check for.

    Returns:
        list: the values that satisfy the condition.
    """
    included_values = []  # result list

    # Recursive helper for nested structures
    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for value in current_data.values():
                # Recurse into every value, whatever its type
                recursive_search(value)
        elif isinstance(current_data, list):
            for item in current_data:
                # Recurse into every list element
                recursive_search(item)
        elif isinstance(current_data, str):
            # For strings, check whether any keyword from includes is present
            if any(include in current_data for include in includes):
                included_values.append(current_data)

    # Start the recursive search
    recursive_search(data)

    return included_values

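# A quick usage sketch (made-up data, for illustration only):
#   sample = {'条款': {'说明': '投标文件逾期送达的,招标人不予受理'}, '其他': ['无关内容']}
#   extract_values_if_contains(sample, ['不予受理'])
#   # -> ['投标文件逾期送达的,招标人不予受理']
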
# Earlier prompt drafts, kept for reference (left in Chinese because they are
# meant verbatim for the Chinese-language model):
#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。
#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",

def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
    follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
    extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # dict result
    all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
    all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
    qianwen_txt = all_texts1 + all_tables1
    # Proceed only if there is content to write
    if qianwen_txt:
        with open(output_file, 'w', encoding='utf-8') as file:
            # Counter used to number each block
            counter = 1
            for content in qianwen_txt:
                file.write("..............." + '\n')
                # Prefix each block with its number, a dot and a space
                file.write(f"{counter}. {content}\n")
                counter += 1
        file_id = upload_file(output_file)
        print("starting qianwen-long...")
        qianwen_ans = qianwen_long(file_id, user_query)
        selected_contents = []
        num_list = process_string_list(qianwen_ans)
        print(num_list)

        for index in num_list:
            if 1 <= index <= len(qianwen_txt):  # numbering starts at 1; guard against out-of-range indices
                content = qianwen_txt[index - 1]
                selected_contents.append(content)
        selected_contents += all_texts2
        selected_contents += all_tables2
        # Collect the result in a dict
        res = {result_key: selected_contents}
        # os.remove(output_file)  # Remove the file after use
        # print(f"Deleted temporary file: {output_file}")
    else:
        res = {result_key: ""}  # empty response if nothing was extracted
    return res

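# Sketch of the temp file handed to qianwen_long (illustrative):
#   ...............
#   1. <first candidate sentence>
#   ...............
#   2. <second candidate sentence>
# The model is asked to answer with the matching indices, e.g. "[1,3]", which
# process_string_list is expected to parse back into a Python list like [1, 3].
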
def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate3):
    print("starting 无效标与废标...")
    queries = [
        (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
         os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
        (r'废\s*标',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
         os.path.join(output_dir, "temp2.txt"), "废标项")
    ]
    results = []

    # Run the queries in parallel with a thread pool
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,
                                     truncate_json_path)
            futures.append(future)
            time.sleep(1)  # pause one second before submitting the next task

        for future in futures:
            results.append(future.result())

    # Forbidden bidding situations
    print("starting 不得存在的情形...")
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
    results.append(forbidden_res)

    combined_dict = {}
    for d in results:
        combined_dict.update(d)

    print("无效标与废标 done...")
    return nest_json_under_key(combined_dict, "无效标与废标项")

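# Expected result shape (assuming nest_json_under_key simply wraps the combined
# dict under the given key; see flask_app.main.json_utils):
#   {"无效标与废标项": {"否决和无效投标情形": [...], "废标项": [...], ...}}
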
if __name__ == '__main__':
    start_time = time.time()
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx'
    results = combine_find_invalid(doc_path, output_dir, truncate_json_path, clause_path, truncate3)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", results)