zbparse/flask_app/general/无效标和废标公共代码.py

import json
import os
import re
import regex
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.general.doubao import generate_full_user_query
from flask_app.general.format_change import pdf2docx
from flask_app.general.insert_del_pagemark import insert_mark
from flask_app.general.通义千问long import qianwen_plus
from flask_app.general.通用功能函数 import process_string_list
from collections import OrderedDict
from docx import Document


def clean_dict_datas(extracted_contents, keywords, excludes):  # normalize the texts captured by the regex extraction
    all_text1 = {}  # holds results where len(text_list) == 1, keyed by a shared running index
    all_text2 = {}  # holds results where len(text_list) > 1, keyed by the same index
    # Sentence-splitting pattern covering Chinese and Western terminal punctuation
    split_pattern = r'(?<=[。!?!?])'
    # Strips a leading list number, e.g. 1 | (1) | 2 | 1. | 2.(full-width dot) | 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
    clean_pattern = (r'^\s*(?:[(（]\s*\d+\s*[)）]|'
                     r'[A-Za-z]?\d+(?:\.\s*\d+)+[\s*\.、．)）]?|'  # e.g. 12.1
                     r'[一二三四五六七八九十]+、|'
                     r'[A-Za-z]\s*[)）\.、．]?\s*|'
                     r'\d+\s*[)）\.、．]?\s*|'
                     r'[\s*\.、．)）]?)')
    idx = 0  # shared running index
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Skip the entry if it contains any excluded string
                if any(exclude in data for exclude in excludes):
                    continue
                # Strip the leading list number
                data = re.sub(clean_pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Look for terminal punctuation starting from the keyword position
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    # Split once on the terminal punctuation defined above
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        # Keep only the first sentence after the keyword, punctuation included.
                        # e.g. '经采购人允许,……蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。'
                        #   -> '经采购人允许,……蒙受损失。'
                        cleaned_text = data[:start_pos] + sentences[0]
                    else:
                        cleaned_text = data  # no terminal punctuation: keep the whole string
                else:
                    # Keyword not found: keep the original text
                    cleaned_text = data
                # Remove half-width and full-width spaces
                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('\u3000', '')
                # Keep it only if longer than 8 characters
                if len(cleaned_text_no_spaces) > 8:
                    all_text1[idx] = cleaned_text_no_spaces
                    idx += 1  # advance the shared index
        else:
            # Structured text: strip the leading number of the first paragraph only,
            # then join the remaining paragraphs into one formatted block.
            data = re.sub(clean_pattern, '', text_list[0]).strip()
            text_list[0] = data  # write the cleaned first element back
            joined_text = "\n".join(text_list)
            # Remove half-width and full-width spaces
            joined_text_no_spaces = joined_text.replace(' ', '').replace('\u3000', '')
            all_text2[idx] = joined_text_no_spaces
            idx += 1  # advance the shared index
    return all_text1, all_text2  # all_text1 still goes through the LLM; all_text2 is returned as-is
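
# Example (a minimal sketch; the input dict below is hypothetical, in the shape
# produced by extract_text_with_keywords):
#
#     extracted = {
#         '1.1 有下列情形之一的,投标无效:': ['1.1 有下列情形之一的,投标无效:',
#                                             '(1)未按招标文件要求密封的;'],
#         '2 投标文件未盖章的,作无效标处理。特此说明。':
#             ['2 投标文件未盖章的,作无效标处理。特此说明。'],
#     }
#     texts1, texts2 = clean_dict_datas(extracted, r'无\s*效', excludes=[])
#     # texts1 == {1: '投标文件未盖章的,作无效标处理。'}  (numbering stripped, text cut
#     #           after the first sentence that follows the keyword)
#     # texts2 == {0: '有下列情形之一的,投标无效:\n(1)未按招标文件要求密封的;'}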


# Merge paragraphs that were split across pages
def preprocess_paragraphs(elements):
    processed = []  # resulting list of paragraphs
    index = 0
    flag = False  # set after a numbered paragraph, to decide whether the next one continues it
    # Pattern describing list items
    list_item_pattern = re.compile(
        r'^\s*('
        r'[((]\d+[))]|'  # e.g. (1) or (1)
        r'[A-Za-z]\.\s*|'  # e.g. A. or b.
        r'[一二三四五六七八九十]+、|'  # e.g. 一、二、三、
        r'第[一二三四五六七八九十百零]+[章节部分]|'  # e.g. 第x章 / 第x部分 / 第x节
        r'[A-Za-z]\d+(?:\.\d+)*[\s\.、．)）]?|'  # e.g. A1.2
        r'\d+(?:\.\d+)+[\s\.、．)）]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # numeric headings such as 1.1 / 1.1.1
        r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digits + space, not followed by a unit keyword
        r'(?=\d+[、.])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digits followed directly by 、 or .
        r')'
    )
    # Matches paragraphs opening with a multi-level numeric heading, e.g. '12.1 内容'
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)'
    )
    # Fallback: single-level headings such as '12. 内容'
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'
    )
    # pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    # pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')

    # Detects runs of whitespace longer than max_space_count,
    # to skip tables and fill-in-the-blank lines
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    # Pattern detecting the page-break markers
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    # Helper: find the previous non-empty, non-marker paragraph
    def find_prev_text(current_index):
        for i in range(current_index - 1, -1, -1):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # skip elements without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    # Helper: find the next non-empty, non-marker paragraph
    def find_next_text(current_index):
        for i in range(current_index + 1, len(elements)):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # skip elements without a text attribute
            # Skip blank paragraphs and page markers
            if not text or pattern_marker.search(text):
                continue
            return text, i
        return '', -1

    while index < len(elements):
        if isinstance(elements[index], str):
            processed.append(elements[index])
            index += 1
            continue
        try:
            current_text = elements[index].text.strip()  # strip surrounding whitespace
        except AttributeError:
            # Skip elements without a text attribute
            index += 1
            continue
        # Is the current paragraph a page marker?
        if pattern_marker.search(current_text):
            # Dynamically find the surrounding non-empty paragraphs
            prev_text, prev_index = find_prev_text(index)
            next_text, next_index = find_next_text(index)
            # Merge across the page break when both sides qualify
            if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
                # '；' and '，' are left out: a trailing comma may mean the sentence isn't finished
                if not prev_text.endswith(('。', '!', '?', '!', '?')):
                    # The next paragraph must not be a list item
                    if not list_item_pattern.match(next_text) and len(prev_text) > 30:
                        merged_text = prev_text + ' ' + next_text  # space added for readability
                        if prev_index < len(elements):
                            # Drop the previous paragraph from processed before re-adding it merged
                            if processed and processed[-1] == prev_text:
                                processed.pop()
                        processed.append(merged_text)
                        # Skip the marker and all blank paragraphs up to next_index
                        index = next_index + 1
                        continue
            # Merge conditions not met: skip the marker and the blank paragraphs around it
            skip_index = index + 1
            while skip_index < len(elements):
                if isinstance(elements[skip_index], str):
                    break
                try:
                    skip_text = elements[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue  # skip elements without a text attribute
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
                    break
            index = skip_index
            continue
        # Does the paragraph start with a numeric heading, e.g. 11.2?
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)
        if match:
            # Numbered paragraph: keep it as-is
            processed.append(current_text)
            flag = True  # the next paragraph may be its continuation
            index += 1
            continue
        else:
            if flag:
                if not list_item_pattern.match(current_text):
                    if processed:
                        next_non_empty_text, next_non_empty_index = find_next_text(index)
                        is_next_numbered = False
                        if next_non_empty_text:
                            is_next_numbered = bool(
                                pattern_numeric_header.match(next_non_empty_text) or
                                pattern_numeric_header_fallback.match(next_non_empty_text)
                            )
                        if is_next_numbered and len(processed[-1]) > 30:
                            # Append to the previous paragraph only when the next one is numbered
                            # and the previous one is longer than 30 characters
                            processed[-1] = processed[-1] + ' ' + current_text
                        else:
                            # Otherwise keep it as a paragraph of its own
                            processed.append(current_text)
                else:
                    # Paragraphs matching list_item_pattern are kept as well
                    processed.append(current_text)
                # Reset the flag either way
                flag = False
                index += 1
                continue
            else:
                # flag is False: keep the paragraph as-is
                processed.append(current_text)
                index += 1
                continue
    return processed
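
# Example (a minimal sketch; FakePara is a hypothetical stand-in for the python-docx
# Paragraph objects that extract_file_elements yields, only a .text attribute is used):
#
#     class FakePara:
#         def __init__(self, text):
#             self.text = text
#
#     elements = [FakePara('依据招标文件第三章的规定,投标人应当在投标截止时间前足额提交投标保证金,逾期提交或者金额不足的'),
#                 FakePara('$$index_mark_1$$'),  # page-break marker
#                 FakePara('其投标文件将被拒绝。')]
#     preprocess_paragraphs(elements)
#     # -> ['依据招标文件第三章的规定,投标人应当在投标截止时间前足额提交投标保证金,逾期提交或者金额不足的 其投标文件将被拒绝。']
#     # The marker is dropped and the sentence cut by the page break is re-joined, because
#     # the previous paragraph is longer than 30 characters, does not end with 。/!/?,
#     # and the next paragraph is not a list item.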


def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keywords):
    if isinstance(keywords, str):
        keywords = [keywords]
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None
    # Headings of the form 一、 or (一) trigger whole-section collection below
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
    pattern_letter_initial = re.compile(r'^([A-Z])\s*[.．、]?\s*(.*)$')

    def match_keywords(text, patterns):
        # Check whether any keyword pattern matches the text
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        def process_matching_section():
            nonlocal current_index, continue_collecting, active_key, current_section_pattern
            found_next_number = False
            current_section_pattern = None
            try:
                while current_index < len(processed_paragraphs) - 1:
                    current_index += 1
                    try:
                        next_text = processed_paragraphs[current_index].strip()
                    except IndexError:
                        print(f"IndexError: current_index={current_index} is out of range.")
                        break  # exit the loop if the index is out of range
                    if not next_text:
                        continue  # skip empty lines
                    if not found_next_number:
                        # Derive a section-number pattern from the first numbered paragraph
                        number_pattern = (
                            r'^([A-Za-z]+(?:[.][A-Za-z]+)*)|'  # letters only
                            r'^([0-9]+(?:[.][0-9]+)*)|'  # digits only
                            r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|'  # Chinese or Arabic numeral in parentheses
                            r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'  # Chinese or Arabic numeral followed by 、
                        )
                        next_section_number = re.match(number_pattern, next_text)
                        if next_section_number:
                            found_next_number = True
                            if next_section_number.group(1):
                                # Letters only
                                dynamic_pattern = r'^[A-Za-z]+(?:[.][A-Za-z]+)*\b'
                            elif next_section_number.group(2):
                                # Digits only
                                dynamic_pattern = r'^[0-9]+(?:[.][0-9]+)*\b'
                            elif next_section_number.group(3):
                                # Chinese or Arabic numeral in parentheses
                                dynamic_pattern = r'^[((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))]'
                            elif next_section_number.group(4):
                                # Chinese or Arabic numeral followed by 、
                                dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
                            else:
                                dynamic_pattern = None
                            if dynamic_pattern:
                                current_section_pattern = re.compile(dynamic_pattern)
                    if current_section_pattern and re.match(current_section_pattern, next_text):
                        extracted_paragraphs[active_key].append(next_text)
                    else:
                        continue_collecting = False
                        active_key = None
                        break
            except Exception as e:
                print(f"An unexpected error occurred in process_matching_section: {e}")

        if continue_collecting:
            if text == '[$$table_start$$]':
                # Collect everything inside the table block verbatim
                current_index += 1
                while processed_paragraphs[current_index] != '[$$table_over$$]':
                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
                    current_index += 1
                return current_index
            if current_section_pattern and re.match(current_section_pattern, text):
                # Reached the next section: stop collecting
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
            return current_index
        try:
            if match_keywords(text, keywords):
                active_key = text
                extracted_paragraphs[active_key] = [text]
                if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text.replace(' ', '')) < 10:
                    # Short numbered heading: collect the whole section that follows
                    continue_collecting = True
                    process_matching_section()
                elif match_keywords(text, follow_up_keywords):
                    # Phrases like '下列情形之一' announce an enumeration: keep collecting
                    continue_collecting = True
                    section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．、]?', text)
                    if section_number:
                        current_section_number = section_number.group(1)
                        level_count = current_section_number.count('.')
                        parts = current_section_number.split('.')
                        matched_patterns = []
                        # Stop patterns for the next few sections one level up, e.g. 7.2 -> 8, 9, ...
                        if len(parts) > 1:
                            for i in range(1, 6):
                                parent_section_parts = parts[:-1].copy()
                                try:
                                    parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
                                except ValueError:
                                    print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.")
                                    continue
                                parent_pattern = r'^' + r'\s*[.．、]\s*'.join(parent_section_parts) + r'(?!\s*[.．、]\s*\d+)'
                                matched_patterns.append(parent_pattern)
                        # Stop patterns for the next few top-level sections, e.g. 7.2 -> 8、 9、 ...
                        try:
                            current_top_level_num = int(current_section_number.split('.')[0])
                            for i in range(1, 6):
                                next_top_level_num = current_top_level_num + i
                                next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.．、](?!\d)'
                                if next_top_level_pattern not in matched_patterns:
                                    matched_patterns.append(next_top_level_pattern)
                        except ValueError:
                            print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.")
                        combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                        try:
                            current_section_pattern = re.compile(combined_pattern)
                        except re.error as re_err:
                            print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'")
                    else:
                        # No numeric heading: fall back to the shared section-matching routine
                        process_matching_section()
        except Exception as e:
            print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}")
        return current_index

    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs
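
# Example (a minimal sketch over an already-preprocessed list of paragraphs; the text
# is hypothetical):
#
#     paragraphs = ['3.1 有下列情形之一的,投标无效:',
#                   '(1)投标文件未按要求签字、盖章的;',
#                   '(2)超出经营范围投标的;',
#                   '4、评标程序见第三章。']
#     extract_text_with_keywords(paragraphs, [r'无\s*效'], [r'下\s*列'])
#     # -> OrderedDict with one entry: the '3.1 ...' heading as key, and the heading plus
#     #    both '(1)/(2)' items as value. '4、评标程序见第三章。' matches the generated
#     #    next-section stop pattern (^4\s*[.．、](?!\d)) and ends the collection.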


# Split the text of a table cell into sentence-level fragments
def split_cell_text(text):
    # Pattern extracting bracketed content; supports both Chinese and Western brackets
    bracket_pattern = re.compile(r'[((][^()()]+[))]')
    # 1. Extract bracketed content and replace it with placeholders
    bracket_contents = []

    def replace_bracket_content(match):
        bracket_contents.append(match.group(0))  # save the bracketed content
        return f"<BRACKET_{len(bracket_contents) - 1}>"  # placeholder

    item_with_placeholders = bracket_pattern.sub(replace_bracket_content, text)
    # 2. Split into sentences, keeping each sentence intact (split on punctuation and list numbers)
    split_sentences = regex.split(
        r'(?<=[。!?!?])|'  # after a Chinese or Western period, exclamation or question mark
        r'(?<![A-Za-z]\s*)(?<!\d[.]?)(?=(?>\d+(?:[.]\d+)+)(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # before a multi-level number; the atomic group (?>...) makes the number match in one go, and it must not be followed by a unit keyword
        r'(?<![+\-×÷*/.A-Za-z]\s*|\d)(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # before digits + space not followed by a unit keyword, and not preceded by an operator, dot or digit, e.g. '1.1 xx'
        r'(?<![+\-×÷*/.A-Za-z]\s*|\d)(?=\d+[、.](?!\d|(?:\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家])))|'  # before digits followed directly by 、 or ., with no digit after the dot, e.g. '1.'
        r'(?<![A-Za-z])(?=[A-Za-z][.]\s*(?![A-Za-z]))|'  # before a single letter + dot; excludes URLs such as www.baidu.com
        r'(?=[A-Za-z]+\s*\d+\s*(?:[.]\s*\d+)*)|'  # before letter + number or multi-level numbers
        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)|'  # before 一、二、… at the start of a line; (?<=^|\n) restricts to line starts
        r'(?<=^|\n)(?=[①②③④⑤⑥⑦⑧⑨]+)',
        item_with_placeholders
    )
    # 3. Restore the bracketed content
    split_sentences = [re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], s)
                       for s in split_sentences]
    # 4. Drop empty fragments
    split_sentences = [s for s in split_sentences if s.strip()]
    return split_sentences
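
# Example (a minimal sketch; the cell text is hypothetical):
#
#     split_cell_text('1.投标文件未密封的(含加密失败)。2.超出经营范围的。')
#     # -> ['1.投标文件未密封的(含加密失败)。', '2.超出经营范围的。']
#     # The bracketed part is shielded by a placeholder first, so the numbering-based
#     # splits cannot break it apart, then restored afterwards.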


# File preprocessing: extract paragraphs and tables in document order, merging tables that span pages
def extract_file_elements(file_path):
    doc = Document(file_path)
    doc_elements = doc.element.body
    doc_paragraphs = doc.paragraphs
    doc_tables = doc.tables
    pre_table_head = None  # header text of the table currently being merged
    table_combine = False
    paragraph_index = 0
    tables_index = 0
    doc_contents = []
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
    # Walk the document elements in order
    for element in doc_elements:
        # Paragraph element
        if element.tag.endswith('}p'):
            try:
                # Only process paragraphs while paragraph_index is within range
                if paragraph_index < len(doc_paragraphs):
                    text = doc_paragraphs[paragraph_index].text
                    if pre_table_head:
                        # Still inside a (possibly cross-page) table: blank paragraphs
                        # and page markers do not end it
                        if text == '' or pattern_marker.search(text):
                            paragraph_index += 1
                            continue
                        else:
                            doc_contents.append('[$$table_over$$]')
                            table_combine = False
                            pre_table_head = None
                    doc_contents.append(doc_paragraphs[paragraph_index])
                    paragraph_index += 1
                else:
                    raise IndexError(f"Paragraph index {paragraph_index} is out of range. Total paragraphs: {len(doc_paragraphs)}")
            except IndexError as e:
                print(f"Error processing paragraph: {e}")
                continue  # skip to the next element on error
        # Table element
        elif element.tag.endswith('}tbl'):
            try:
                if tables_index < len(doc_tables):
                    table = doc_tables[tables_index]
                    table_content = []
                    for row_idx, row in enumerate(table.rows):
                        if row_idx == 0:
                            if pre_table_head:
                                table_combine = True
                                if pre_table_head == row.cells[0].text:
                                    continue  # repeated header row of a continued table
                            else:
                                pre_table_head = row.cells[0].text
                                doc_contents.append('[$$table_start$$]')
                                continue
                        for cell in row.cells:
                            cell_text = cell.text.strip()
                            if len(cell_text) > 8:
                                cell_text = split_cell_text(cell_text)
                                table_content += cell_text
                    if table_combine:
                        # Re-join a sentence that the page break cut inside the table
                        if doc_contents and table_content and not doc_contents[-1].endswith(('。', '!', '?', ';', '!', '?', ';')):
                            doc_contents[-1] += ' ' + table_content[0]
                            table_content.pop(0)
                    doc_contents.extend(table_content)
                    tables_index += 1
                else:
                    raise IndexError(f"Table index {tables_index} is out of range. Total tables: {len(doc_tables)}")
            except IndexError as e:
                print(f"Error processing table: {e}")
                continue  # skip to the next element on error
    return doc_contents
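
# Example (a minimal sketch, assuming 'invalid_added.docx' is a tender document that
# pdf2docx produced after insert_mark added the $$index_mark_i$$ page markers):
#
#     contents = extract_file_elements('invalid_added.docx')
#     # -> a list in document order that mixes python-docx Paragraph objects (body text)
#     #    with plain strings: each table is wrapped in '[$$table_start$$]' /
#     #    '[$$table_over$$]', and cells longer than 8 characters arrive pre-split by
#     #    split_cell_text.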


def handle_query(file_path, user_query, output_file, result_key, keywords):
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"]
        follow_up_keywords = [
            r'情\s*形\s*之\s*一',
            r'情\s*况\s*之\s*一',
            r'下\s*列(?!\s*公式)',  # negative lookahead excludes '下列公式'
            r'以\s*下(?!\s*(?:公式|简称))',  # negative lookahead excludes '以下公式' / '以下简称'
            r'其\s*他.*?情\s*形\s*[::]',
            r'包\s*括'
        ]
        doc_contents = extract_file_elements(file_path)
        processed_paragraphs = preprocess_paragraphs(doc_contents)
        extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)
        # Deduplicate all_texts1 on the first 25 characters, keeping the first occurrence
        seen_contents = set()
        unique_all_texts1 = {}
        for key, content in all_texts1.items():
            content_key = content[:25]
            if content_key not in seen_contents:
                unique_all_texts1[key] = content
                seen_contents.add(content_key)
        # 1. Order the deduplicated items by their shared index
        all_text1_items = sorted(unique_all_texts1.items(), key=lambda x: x[0])
        # 2. Plain list of the contents, in that order
        all_texts1_list = [content for (_, content) in all_text1_items]
        # Proceed only if there is content to write
        selected_contents = {}
        final_list = [f"未解析到'{result_key}'"]
        if all_texts1_list or all_texts2:
            with open(output_file, 'w', encoding='utf-8') as file:
                counter = 1
                for content in all_texts1_list:
                    file.write(f"{counter}. {content}\n")
                    file.write("..............." + '\n')
                    counter += 1
            # Build the user query and let the LLM pick the relevant entries
            if not all_texts1_list:
                print("无需调用大模型获取选中的序号!")
                num_list = []
            else:
                user_query = generate_full_user_query(output_file, user_query)
                model_ans = qianwen_plus(user_query)  # answer from the qianwen_plus model
                # file_id = upload_file(output_file)
                # model_ans = qianwen_long(file_id, user_query)
                num_list = process_string_list(model_ans)  # parse the index list returned by the model
                print(result_key + "选中的序号:" + str(num_list))
            for index in num_list:
                if 1 <= index <= len(all_texts1_list):
                    original_global_idx = all_text1_items[index - 1][0]
                    content = all_text1_items[index - 1][1]
                    selected_contents[original_global_idx] = content
            # Merge the selected all_text1 entries with all_text2
            merged_dict = {}
            # Selected all_text1 entries first
            for idx, txt in selected_contents.items():
                merged_dict[idx] = txt
            # Then all_text2
            for idx, txt in all_texts2.items():
                merged_dict[idx] = txt
            final_list = [txt for idx, txt in sorted(merged_dict.items(), key=lambda x: x[0])]
        return {result_key: final_list}
    except Exception as e:
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: [f"未解析到'{result_key}'"]}
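
# Example call (a minimal sketch; the path is a placeholder and the prompt is abridged;
# generate_full_user_query is expected to substitute {full_text} with the written file):
#
#     res = handle_query('invalid_added.docx',
#                        '……请返回符合条件的信息的序号……文本内容:{full_text}',
#                        'temp1.txt', '否决和无效投标情形', r'无\s*效\s*标')
#     # -> {'否决和无效投标情形': [...]} on success, or
#     #    {'否决和无效投标情形': ["未解析到'否决和无效投标情形'"]} when nothing was
#     #    parsed or an exception occurred.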


def combine_find_invalid(invalid_docpath, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    queries = [
        (
            # Keywords for rejected / invalid bids, e.g. 否决 / 无效投标 / 视为无效 / 拒绝
            r'否\s*决|'
            r'无\s*效\s*投\s*标|'
            r'无\s*效\s*文\s*件|'
            r'(?:文\s*件|投\s*标|响\s*应)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|'
            r'无\s*效\s*响\s*应|'
            r'无\s*效\s*报\s*价|'
            r'无\s*效\s*标|'
            r'视\s*为\s*无\s*效|'
            r'拒\s*绝|'
            r'予\s*以\s*拒\s*绝',
            """以下是从招标文件中摘取的内容,文本中序号分明,各信息之间以...............分割。
            任务目标:
            从文本中筛选所有描述否决投标,拒绝投标,投标、响应无效或类似表述的情况,并返回对应的序号。
            要求与指南:
            1.投标相关主体:包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
            2.文本中可能存在无关的信息,请准确筛选符合条件的信息,即怎样的情况下,投标相关主体的投标将被否决、拒绝,作为无效标,或者是投标无效、响应无效等,请返回符合条件的信息的序号。
            3.若条款内容包含'否决投标的情况说明'这样的笼统描述,而未说明具体的情形,则无需返回该条款。
            输出格式:
            以 [x, x, x] 的形式返回,x 为符合条件的信息的序号,为自然数。无需额外返回解释与说明。
            如果文本中没有符合条件的信息,请返回 []。
            特殊情况:
            如果某序号的内容明显分为几部分且其中一部分内容符合筛选条件,但其他部分明显是无关内容,请返回符合部分的字符串内容代替序号。
            示例输出,仅供格式参考:
            [1,3,4,6]
            文本内容:{full_text}
            """,
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        # (
        #     r'废\s*标',
        #     """以下是从招标文件中摘取的内容,文本中序号分明,文本内之间的信息以'...............'分割。
        #     任务目标:
        #     请根据以下内容,筛选出 废标项的情况 (明确描述导致 废标 的情况)并返回对应的序号。
        #     要求与指南:
        #     文本中可能存在无关的信息,请准确筛选符合条件的信息,并将符合条件的信息的序号返回。
        #     输出格式:
        #     返回结果以 [x, x, x] 的形式,其中 x 为符合条件的信息的序号,为自然数。无需额外返回解释与说明。
        #     如果文本中没有任何符合条件的废标情况,请返回 []。
        #     示例输出,仅供格式参考:
        #     [1,3,4,6]
        #     文本内容:{full_text}
        #     """,
        #     os.path.join(output_dir, "temp2.txt"),
        #     "废标项"
        # ),
        # (
        #     r'不\s*得(?!\s*(分|力))|禁\s*止\s*投\s*标',
        #     """以下是从招标文件中摘取的内容,文本中序号分明,文本内的条款以'...............'分割。条款规定了各方不得存在的情形。请根据以下要求进行筛选:
        #     **投标相关主体与非投标相关主体的定义**
        #     投标相关主体:包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
        #     非投标相关主体:包括但不限于“招标人”、“采购人”、“评标委员会”或其他描述非投标方的词语。
        #     **筛选要求**
        #     1. **仅筛选**明确描述投标相关主体禁止情形或不得存在的情形的条款,不包含笼统或未具体说明情形的条款。例如:
        #     若条款内容包含'投标人不得存在的其他关联情形'这样的笼统描述,而未说明具体的情形,则无需添加该条款。
        #     2. **排除**仅描述非投标相关主体行为限制或禁止情形的条款,例如“招标人不得泄露信息”或“评标委员会不得收受贿赂”,则无需返回。
        #     3. 若条款同时描述了对投标相关主体与非投标相关主体的行为限制、禁止情形,也需返回。
        #     4. **特殊情况**:如果条款中包含“磋商小组”、“各方”等既能指代投标相关主体又能指代非投标相关主体的词汇:
        #     若在语境中其指代或包含投标相关主体,则应将其考虑在内;否则,排除该条款。
        #
        #     **输出格式**
        #     返回结果以 [x, x, x] 的形式,其中 x 为符合条件的条款的序号,为自然数。无需额外返回解释与说明。
        #     如果没有符合条件的条款,返回 `[]`。
        #     **示例**
        #     - **符合条件**
        #       - `1. 投标人不得...` → 包含,返回序号 1。
        #       - `3. 联合体投标各方不得...` → 包含,返回序号 3。
        #     - **不符合条件**
        #       - `2. 采购人不得...` → 主语为“采购人”,排除。
        #     - 示例输出: [1,3]
        #     请根据上述筛选要求,阅读以下文本内容,并返回符合条件的条款序号,
        #
        #     文本内容:{full_text}
        #     """,
        #     os.path.join(output_dir, "temp3.txt"),
        #     "不得存在的情形"
        # )
    ]
    results = []
    # Process the queries in parallel on a thread pool
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, invalid_docpath, user_query, output_file, result_key, keywords)
            futures.append((future, result_key))  # keep submission order
            time.sleep(0.5)  # pause 0.5s before submitting the next task
        for future, result_key in futures:
            try:
                result = future.result()
            except Exception as e:
                print(f"线程处理 {result_key} 时出错: {e}")
                result = {result_key: ""}
            results.append(result)
    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}


if __name__ == '__main__':
    start_time = time.time()
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件实高电子显示屏_tobidders_notice_part1.docx"
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
    pdf_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\招标文件(107国道)_invalid.pdf'
    output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
    # invalid_added = insert_mark(pdf_path)
    # invalid_added_docx = pdf2docx(invalid_added)
    invalid_added_docx = r'C:\Users\Administrator\Desktop\fsdownload\355fbb20-e439-48dd-abdd-7645d424af36\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
    # print("Elapsed time:", str(end_time - start_time))