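"""Locate "rejected bid" (否决/无效投标) and "annulled bid" (废标) clauses in a
tender document: regex-based extraction over the docx paragraphs and tables,
followed by an LLM pass (qianwen_plus) that picks the relevant entries.
"""
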
import json
import os
import re
import time
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor

import regex
from docx import Document

from flask_app.general.llm.大模型通用函数 import generate_full_user_query
from flask_app.general.llm.qianwen_plus import qianwen_plus
from flask_app.general.通用功能函数 import process_string_list


def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalize the passages pulled out by the regex extraction step.

    Returns two dicts that share one running index, so the original document
    order survives a later merge:
      - all_text1: single-paragraph hits (these still need an LLM pass)
      - all_text2: multi-paragraph blocks (returned as-is)
    """
    all_text1 = {}  # results where len(text_list) == 1, keyed by the shared index
    all_text2 = {}  # results where len(text_list) > 1, keyed by the shared index
    # Sentence splitter: Chinese and Western sentence-ending punctuation
    split_pattern = r'(?<=[。!?\!\?])'
    # Strip a leading list number, e.g. 1 | (1) | (2) | 1. | 2. | 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
    # Every alternative sits inside the (?:...) group so the ^\s* anchor applies
    # to all of them; with the group closed early, re.sub would also delete
    # digits and punctuation in the middle of the string.
    clean_pattern = (r'^\s*(?:[((]\s*\d+\s*[)))]|'
                     r'[A-Za-z]?\d+(?:\.\s*\d+)+[\s*\.、.))]?|'  # e.g. 12.1
                     r'[一二三四五六七八九十]+、|'
                     r'[A-Za-z]\s*[))\.、.]?\s*|'
                     r'\d+\s*[))\.、.]?\s*|'
                     r'[\s*\.、.))]?)')

    idx = 0  # index shared by all_text1 and all_text2
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Skip the paragraph if it contains any excluded phrase
                if any(exclude in data for exclude in excludes):
                    continue
                # Drop the leading list number
                data = re.sub(clean_pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Look for the first sentence-ending mark after the keyword
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    # Split once on the sentence-ending punctuation
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        # Keep only up to the first sentence end (punctuation
                        # included); any following sentences are discarded.
                        cleaned_text = data[:start_pos] + sentences[0]
                    else:
                        cleaned_text = data  # no sentence-ending mark: keep the whole string
                else:
                    cleaned_text = data  # keyword not found: keep the original text
                # Remove half-width and full-width spaces
                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('\u3000', '')
                # Keep only strings longer than 8 characters
                if len(cleaned_text_no_spaces) > 8:
                    all_text1[idx] = cleaned_text_no_spaces
                    idx += 1  # advance the shared index
        else:
            # Structured multi-paragraph text: strip the list number from the
            # first paragraph only, then join the paragraphs into one block.
            data = re.sub(clean_pattern, '', text_list[0]).strip()
            text_list[0] = data
            joined_text = "\n".join(text_list)
            # Remove half-width and full-width spaces
            joined_text_no_spaces = joined_text.replace(' ', '').replace('\u3000', '')
            all_text2[idx] = joined_text_no_spaces
            idx += 1  # advance the shared index

    return all_text1, all_text2  # all_text1 goes through the LLM, all_text2 is returned directly
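
# A minimal sketch of the two return values (hypothetical input, not part of
# the original module). The shared index keeps document order intact when the
# two dicts are merged again later:
#
#   extracted = {
#       '3.1 投标文件未密封的,投标无效。潜在投标人应重新递交。':
#           ['3.1 投标文件未密封的,投标无效。潜在投标人应重新递交。'],
#       '7.2 有下列情形之一的:':
#           ['7.2 有下列情形之一的:', '(1)串通投标;', '(2)弄虚作假。'],
#   }
#   t1, t2 = clean_dict_datas(extracted, r'无\s*效', [])
#   # t1 -> {0: '投标文件未密封的,投标无效。'}   (cut at the first sentence end)
#   # t2 -> {1: '有下列情形之一的:\n(1)串通投标;\n(2)弄虚作假。'}

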
def preprocess_paragraphs(elements):
    '''
    Merge paragraphs that were split across a page break: the logic below
    decides programmatically whether two fragments belong together.
    '''
    processed = []  # resulting list of paragraphs
    index = 0
    flag = False  # True right after a numbered paragraph has been appended

    # Patterns that mark the start of a list item
    list_item_pattern = re.compile(
        r'^\s*('
        r'[(\(]\d+[)\)]|'  # (1) or (1)
        r'[A-Za-z]\.\s*|'  # A. or b.
        r'[一二三四五六七八九十]+、|'  # 一、 二、 三、
        r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 第x章 / 第x部分 / 第x节
        r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|'  # A1.2 and similar
        r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # numeric headings such as 1.1 or 1.1.1
        r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digits + space, not followed by a unit word
        r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digits directly followed by 、 or a dot
        r')'
    )

    # Paragraphs that start with a multi-level numeric heading
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)'  # e.g. '12.1 content'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # e.g. '12. content'
    )

    # True if the text contains a whitespace run longer than max_space_count;
    # used to leave tables and fill-in-the-blank lines alone
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    # Page-break markers inserted during preprocessing
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    # Helper: nearest preceding non-empty, non-marker paragraph
    def find_prev_text(current_index):
        for i in range(current_index - 1, -1, -1):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # skip objects without a text attribute
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    # Helper: nearest following non-empty, non-marker paragraph
    def find_next_text(current_index):
        for i in range(current_index + 1, len(elements)):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # skip objects without a text attribute
            # Skip blank paragraphs and page markers
            if not text or pattern_marker.search(text):
                continue
            return text, i
        return '', -1

    while index < len(elements):
        if isinstance(elements[index], str):
            processed.append(elements[index])
            index += 1
            continue
        try:
            current_text = elements[index].text.strip()
        except AttributeError:
            # Skip objects without a text attribute
            index += 1
            continue

        # Is the current paragraph a page-break marker?
        if pattern_marker.search(current_text):
            # Locate the surrounding non-empty paragraphs
            prev_text, prev_index = find_prev_text(index)
            next_text, next_index = find_next_text(index)

            # Merge across the page break when both sides look like one sentence
            if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
                # Commas are deliberately not treated as sentence ends: the
                # sentence may still continue after one.
                if not prev_text.endswith(('。', '!', '?')):
                    # The following paragraph must not be a list item
                    if not list_item_pattern.match(next_text) and len(prev_text) > 30:
                        merged_text = prev_text + ' ' + next_text  # space added for readability
                        if prev_index < len(elements):
                            # Swap the previously appended fragment for the merged text
                            if processed and processed[-1] == prev_text:
                                processed.pop()
                            processed.append(merged_text)

                        # Jump past the marker and all blank paragraphs up to next_index
                        index = next_index + 1
                        continue

            # Merge conditions not met: skip the marker together with the
            # surrounding run of blank paragraphs and markers
            skip_index = index + 1
            while skip_index < len(elements):
                if isinstance(elements[skip_index], str):
                    break
                try:
                    skip_text = elements[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue  # skip objects without a text attribute
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
                    break
            index = skip_index
            continue

        # Does the paragraph start with a numeric heading, e.g. 11.2?
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)

        if match:
            # Numbered paragraph: append as-is
            processed.append(current_text)
            flag = True  # the next paragraph may be its continuation
            index += 1
            continue
        else:
            if flag:
                if not list_item_pattern.match(current_text):
                    if processed:
                        # Peek ahead: is the next non-empty paragraph numbered?
                        next_non_empty_text, next_non_empty_index = find_next_text(index)
                        is_next_numbered = False
                        if next_non_empty_text:
                            is_next_numbered = bool(
                                pattern_numeric_header.match(next_non_empty_text) or
                                pattern_numeric_header_fallback.match(next_non_empty_text)
                            )

                        if is_next_numbered and len(processed[-1]) > 30:
                            # Append to the previous paragraph only when the next
                            # one is numbered and the previous one is longer than
                            # 30 characters
                            processed[-1] = processed[-1] + ' ' + current_text
                        else:
                            # Otherwise keep it as a paragraph of its own
                            processed.append(current_text)
                else:
                    # Paragraphs matching list_item_pattern are kept as well
                    processed.append(current_text)
                # Reset the flag either way
                flag = False
                index += 1
                continue
            else:
                # flag is False: append directly
                processed.append(current_text)
                index += 1
                continue

    return processed
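
# A minimal sketch of the page-break merge (hypothetical stubs, not part of
# the original module):
#
#   class P:  # stand-in for a docx paragraph
#       def __init__(self, text): self.text = text
#
#   elems = [P('经评标委员会认定,投标人以他人名义投标或者以其他方式弄虚作假,骗取中标的,其投标'),
#            P('$$index_mark_1$$'),  # page-break marker
#            P(''),
#            P('无效,并承担相应的法律责任。')]
#   preprocess_paragraphs(elems)
#   # -> ['经评标委员会认定,...,其投标 无效,并承担相应的法律责任。']
#   # The fragments merge because the first one is longer than 30 characters,
#   # does not end a sentence, and the second one is not a list item.

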
def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keywords):
    if isinstance(keywords, str):
        keywords = [keywords]
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None
    # Short headings that introduce an enumeration: 一、 / (一) / A.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
    pattern_letter_initial = re.compile(r'^([A-Z])\s*[..、]?\s*(.*)$')

    def match_keywords(text, patterns):
        # True if any of the keyword patterns matches the text
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        def process_matching_section():
            # Collect the sibling items that follow the current heading: derive
            # a pattern from the numbering style of the first item and keep
            # collecting while that pattern keeps matching.
            nonlocal current_index, continue_collecting, active_key, current_section_pattern

            found_next_number = False
            current_section_pattern = None

            try:
                while current_index < len(processed_paragraphs) - 1:
                    current_index += 1
                    try:
                        next_text = processed_paragraphs[current_index].strip()
                    except IndexError:
                        print(f"IndexError: current_index={current_index} is out of range.")
                        break  # exit the loop if the index is out of range

                    if not next_text:
                        continue  # skip empty lines

                    if not found_next_number:
                        # Recognize the numbering style of the first following item
                        number_pattern = (
                            r'^([A-Za-z]+(?:[..][A-Za-z]+)*)|'  # letters only
                            r'^([0-9]+(?:[..][0-9]+)*)|'  # digits only
                            r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|'  # Chinese or Arabic numeral in parentheses
                            r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'  # Chinese or Arabic numeral followed by 、
                        )
                        next_section_number = re.match(number_pattern, next_text)

                        if next_section_number:
                            found_next_number = True
                            if next_section_number.group(1):
                                # letters only
                                dynamic_pattern = r'^[A-Za-z]+(?:[..][A-Za-z]+)*\b'
                            elif next_section_number.group(2):
                                # digits only
                                dynamic_pattern = r'^[0-9]+(?:[..][0-9]+)*\b'
                            elif next_section_number.group(3):
                                # Chinese or Arabic numeral in parentheses
                                dynamic_pattern = r'^[((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))]'
                            elif next_section_number.group(4):
                                # Chinese or Arabic numeral followed by 、
                                dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
                            else:
                                dynamic_pattern = None

                            if dynamic_pattern:
                                current_section_pattern = re.compile(dynamic_pattern)

                    if current_section_pattern and re.match(current_section_pattern, next_text):
                        extracted_paragraphs[active_key].append(next_text)
                    else:
                        continue_collecting = False
                        active_key = None
                        break
            except Exception as e:
                print(f"An unexpected error occurred in process_matching_section: {e}")

        if continue_collecting:
            if text == '[$$table_start$$]':
                # Copy everything up to the closing table sentinel; the bounds
                # check guards against a missing '[$$table_over$$]'
                current_index += 1
                while (current_index < len(processed_paragraphs)
                       and processed_paragraphs[current_index] != '[$$table_over$$]'):
                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
                    current_index += 1
                return current_index
            if current_section_pattern and re.match(current_section_pattern, text):
                # A sibling or following section started: stop collecting
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
            return current_index

        try:
            if match_keywords(text, keywords):
                active_key = text
                extracted_paragraphs[active_key] = [text]

                if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)
                        or re.match(pattern_letter_initial, text)) and len(text.replace(' ', '')) < 10:
                    # A short enumerated heading: its items follow immediately
                    continue_collecting = True
                    process_matching_section()
                elif match_keywords(text, follow_up_keywords):
                    # Phrases such as 下列/情形之一 announce a list below: collect
                    # until the numbering shows that the section has ended
                    continue_collecting = True
                    section_number = re.match(r'^(\d+([..]\d+)*)\s*[..、]?', text)

                    if section_number:
                        # Build the patterns that mark the end of this section:
                        # the next few numbers at the parent level plus the next
                        # few top-level numbers
                        current_section_number = section_number.group(1)
                        parts = current_section_number.split('.')

                        matched_patterns = []
                        if len(parts) > 1:
                            for i in range(1, 6):
                                parent_section_parts = parts[:-1].copy()
                                try:
                                    parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
                                except ValueError:
                                    print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.")
                                    continue
                                parent_pattern = r'^' + r'\s*[..、]\s*'.join(parent_section_parts) + r'(?!\s*[..、]\s*\d+)'
                                matched_patterns.append(parent_pattern)

                        try:
                            current_top_level_num = int(current_section_number.split('.')[0])
                            for i in range(1, 6):
                                next_top_level_num = current_top_level_num + i
                                next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..、](?!\d)'
                                if next_top_level_pattern not in matched_patterns:
                                    matched_patterns.append(next_top_level_pattern)
                        except ValueError:
                            print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.")

                        combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                        try:
                            current_section_pattern = re.compile(combined_pattern)
                        except re.error as re_err:
                            print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'")

                    else:
                        process_matching_section()  # reuse the shared section-matching routine

        except Exception as e:
            print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}")

        return current_index

    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs
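
# A minimal sketch of the section collection (hypothetical paragraphs, not
# part of the original module). '7.1 ...' matches both a keyword (无效) and a
# follow-up phrase (下列), so the items after it are collected until a
# paragraph matches one of the derived end-of-section numbers (here 8..12):
#
#   paras = ['7.1 有下列情形之一的,投标无效:',
#            '(1)未按要求密封;',
#            '(2)逾期送达;',
#            '8.评标程序']
#   extract_text_with_keywords(paras, [r'无\s*效'], [r'下\s*列'])
#   # -> OrderedDict with one key, '7.1 有下列情形之一的,投标无效:', whose
#   #    value is the heading plus the two (x) items; '8.评标程序' stops the
#   #    collection and is not included.

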
# Split the text of a table cell into sentences
def split_cell_text(text):
    # Bracketed content (Chinese or Western brackets) must survive the splitting
    bracket_pattern = re.compile(r'[((][^(()))]+[))]')

    # 1. Extract bracketed content and replace it with placeholders
    bracket_contents = []

    def replace_bracket_content(match):
        bracket_contents.append(match.group(0))  # keep the bracketed text
        return f"<BRACKET_{len(bracket_contents) - 1}>"  # placeholder

    item_with_placeholders = bracket_pattern.sub(replace_bracket_content, text)

    # 2. Split into sentences on punctuation and list numbers while keeping
    #    each sentence intact
    split_sentences = regex.split(
        r'(?<=[。!?!?\?])|'  # after Chinese/Western sentence-ending punctuation
        r'(?<![A-Za-z]\s*)(?<!\d[..]?)(?=(?>\d+(?:[..]\d+)+)(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # multi-level numbers; the atomic group (?>...) makes the whole number match in one go, and the lookahead rejects unit words
        r'(?<![+\-×÷*/.\.A-Za-z]\s*|\d)(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # digits followed by a space but not a unit word, e.g. '1.1 xx'
        r'(?<![+\-×÷*/.\.A-Za-z]\s*|\d)(?=\d+[、..](?!\d|(?:\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家])))|'  # digits directly followed by 、 or a dot that is not part of a number, e.g. '1.'
        r'(?<![A-Za-z])(?=[A-Za-z][..]\s*(?![A-Za-z]))|'  # a single letter plus dot, excluding URLs such as www.baidu.com
        r'(?=[A-Za-z]+\s*\d+\s*(?:[..]\s*\d+)*)|'  # before letter+number or letter+multi-level number
        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)|'  # before Chinese numerals with 、 (一、 二、), anchored to line starts
        r'(?<=^|\n)(?=[①②③④⑤⑥⑦⑧⑨]+)',  # before circled numbers, anchored to line starts
        item_with_placeholders
    )

    # 3. Restore the bracketed content
    split_sentences = [re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], s)
                       for s in split_sentences]
    # 4. Drop empty fragments
    split_sentences = [s for s in split_sentences if s.strip()]
    return split_sentences
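
# A minimal sketch of the splitting behaviour (hypothetical cell text, not
# part of the original module):
#
#   split_cell_text('1.投标人须具备资质。2.报价超出预算的,视为无效报价。')
#   # -> ['1.投标人须具备资质。', '2.报价超出预算的,视为无效报价。']
#
#   # Bracketed text is shielded by a placeholder, so the digit inside the
#   # parentheses does not trigger a split:
#   split_cell_text('报价表(含第2部分)须加盖公章。')
#   # -> ['报价表(含第2部分)须加盖公章。']

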
def extract_file_elements(file_path):
    '''
    File preprocessing: walk the document body in order, collecting paragraphs
    and tables, and merge tables that continue across a page break.
    '''
    doc = Document(file_path)
    doc_elements = doc.element.body
    doc_paragraphs = doc.paragraphs
    doc_tables = doc.tables
    pre_table_head = None  # header text of the table currently being read
    table_combine = False  # True while merging a cross-page table
    paragraph_index = 0
    tables_index = 0
    doc_contents = []

    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    # Walk the body elements in document order
    for element in doc_elements:
        # Paragraph element
        if element.tag.endswith('}p'):
            try:
                # Only process paragraphs while paragraph_index is within range
                if paragraph_index < len(doc_paragraphs):
                    text = doc_paragraphs[paragraph_index].text
                    if pre_table_head:
                        # Still inside a table: blank paragraphs and page
                        # markers between its parts are skipped
                        if (text == '' or pattern_marker.search(text)):
                            paragraph_index += 1
                            continue
                        else:
                            # Real text follows, so the table has ended
                            doc_contents.append('[$$table_over$$]')
                            table_combine = False
                            pre_table_head = None
                    doc_contents.append(doc_paragraphs[paragraph_index])
                    paragraph_index += 1
                else:
                    raise IndexError(f"Paragraph index {paragraph_index} is out of range. Total paragraphs: {len(doc_paragraphs)}")
            except IndexError as e:
                print(f"Error processing paragraph: {e}")
                continue  # skip to the next element on error

        # Table element
        elif element.tag.endswith('}tbl'):
            try:
                if tables_index < len(doc_tables):
                    table = doc_tables[tables_index]
                    table_content = []
                    for row_idx, row in enumerate(table.rows):
                        if row_idx == 0:
                            if pre_table_head:
                                table_combine = True
                                # Same header as the previous table chunk: this
                                # is the continuation of a cross-page table
                                if pre_table_head == row.cells[0].text:
                                    continue
                            else:
                                pre_table_head = row.cells[0].text
                                doc_contents.append('[$$table_start$$]')
                                continue
                        for cell in row.cells:
                            cell_text = cell.text.strip()
                            if len(cell_text) > 8:
                                cell_text = split_cell_text(cell_text)
                                table_content += cell_text
                    if table_combine:
                        # Glue the first fragment onto the previous content when
                        # the previous line did not end a sentence
                        if doc_contents and table_content and not doc_contents[-1].endswith(('。', '!', '?', ';')):
                            doc_contents[-1] += ' ' + table_content[0]
                            table_content.pop(0)
                    doc_contents.extend(table_content)
                    tables_index += 1
                else:
                    raise IndexError(f"Table index {tables_index} is out of range. Total tables: {len(doc_tables)}")
            except IndexError as e:
                print(f"Error processing table: {e}")
                continue  # skip to the next element on error

    return doc_contents
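
# Shape of the emitted stream (illustrative): a table arrives between two
# sentinels,
#
#   ..., Paragraph, '[$$table_start$$]',
#   'cell sentence 1', 'cell sentence 2', ...,
#   '[$$table_over$$]', Paragraph, ...
#
# extract_text_with_keywords later copies everything between the sentinels
# verbatim while it is collecting a section.

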
def handle_query(file_path, user_query, output_file, result_key, keywords):
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"]
        follow_up_keywords = [
            r'情\s*形\s*之\s*一',
            r'情\s*况\s*之\s*一',
            r'下\s*列(?!\s*公式)',  # negative lookahead excludes '下列公式'
            r'以\s*下(?!\s*(?:公式|简称))',  # negative lookahead excludes '以下公式' / '以下简称'
            r'其\s*他.*?情\s*形\s*[::]',
            r'包\s*括'
        ]
        doc_contents = extract_file_elements(file_path)
        processed_paragraphs = preprocess_paragraphs(doc_contents)
        extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # two dicts sharing one index

        seen_contents = set()  # contents already seen, for deduplication
        unique_all_texts1 = {}  # deduplicated entries
        for key, content in all_texts1.items():
            content_key = content[:25]  # first 25 characters serve as the dedup key
            if content_key not in seen_contents:
                unique_all_texts1[key] = content  # keep the first occurrence
                seen_contents.add(content_key)
        # 1. Sort into (global index, content) pairs
        all_text1_items = sorted(unique_all_texts1.items(), key=lambda x: x[0])
        # 2. Plain list of contents
        all_texts1_list = [content for (_, content) in all_text1_items]
        # Proceed only if there is content to write
        selected_contents = {}
        final_list = [f"未解析到'{result_key}'!"]
        if all_texts1_list or all_texts2:
            with open(output_file, 'w', encoding='utf-8') as file:
                counter = 1
                for content in all_texts1_list:
                    file.write(f"{counter}. {content}\n")
                    file.write("..............." + '\n')
                    counter += 1

            # Build the full user query and let the LLM pick the relevant entries
            if not all_texts1_list:
                print("无需调用大模型获取选中的序号!")
                num_list = []
            else:
                user_query = generate_full_user_query(output_file, user_query)
                model_ans = qianwen_plus(user_query)  # Qianwen-Plus model answer
                num_list = process_string_list(model_ans)  # parse the returned list of indices
                print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
                if 1 <= index <= len(all_texts1_list):
                    original_global_idx = all_text1_items[index - 1][0]
                    content = all_text1_items[index - 1][1]
                    selected_contents[original_global_idx] = content
            # Merge the selected all_text1 entries with all_text2
            merged_dict = {}
            # Selected all_text1 entries first
            for idx, txt in selected_contents.items():
                merged_dict[idx] = txt
            # Then all_text2
            for idx, txt in all_texts2.items():
                merged_dict[idx] = txt
            # The shared index restores the original document order
            final_list = [txt for idx, txt in sorted(merged_dict.items(), key=lambda x: x[0])]

        return {result_key: final_list}
    except Exception as e:
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: [f"未解析到'{result_key}'!"]}
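
# The temp file handed to the LLM looks like this (illustrative):
#
#   1. 投标文件未密封的,投标无效。
#   ...............
#   2. 报价超出预算的,视为无效报价。
#   ...............
#
# and the model is expected to answer with an index list such as [1, 2],
# which process_string_list then parses.

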
def combine_find_invalid(invalid_docpath, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Each entry: (extraction keywords, LLM prompt, temp file, result key)
    queries = [
        (
            r'否\s*决|'
            r'无\s*效\s*投\s*标|'
            r'无\s*效\s*文\s*件|'
            r'(?:文\s*件|投\s*标|响\s*应)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|'
            r'无\s*效\s*响\s*应|'
            r'无\s*效\s*报\s*价|'
            r'无\s*效\s*标|'
            r'视\s*为\s*无\s*效|'
            r'被\s*拒\s*绝|'
            r'将\s*拒\s*绝|'
            r'予\s*以\s*拒\s*绝',
            """以下是从招标文件中摘取的内容,文本中序号分明,各信息之间以...............分割。
任务目标:
从文本中筛选所有描述否决投标,拒绝投标,投标、响应无效或类似表述的情况,并返回对应的序号。
要求与指南:
1.投标相关主体:包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
2.文本中可能存在无关的信息,请准确筛选符合条件的信息,即怎样的情况下,投标相关主体的投标将被否决、拒绝,作为无效标,或者是投标无效、响应无效等,请返回符合条件的信息的序号。
3.若条款内容包含'否决投标的情况说明'这样的笼统描述,而未说明具体的情形,则无需返回该条款。
输出格式:
以 [x, x, x] 的形式返回,x 为符合条件的信息的序号,为自然数。无需额外返回解释与说明。
如果文本中没有符合条件的信息,请返回 []。
特殊情况:
如果某序号的内容明显分为几部分且一部分内容符合筛选条件,但其他部分明显是无关内容,请返回符合部分的字符串内容代替序号。
示例输出,仅供格式参考:
[1,3,4,6]
文本内容:{full_text}
""",
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        (
            r'废\s*标',
            """以下是从招标文件中摘取的内容,文本中序号分明,文本内的信息以'...............'分割。
任务目标:
请根据以下内容,筛选出 废标项的情况 (明确描述导致 废标 的情况)并返回对应的序号。
要求与指南:
文本中可能存在无关的信息,请准确筛选符合条件的信息,并将符合条件的信息的序号返回。
输出格式:
返回结果以 [x, x, x] 的形式,其中 x 为符合条件的信息的序号,为自然数。无需额外返回解释与说明。
如果文本中没有任何符合条件的废标情况,请返回 []。
示例输出,仅供格式参考:
[1,3,4,6]
文本内容:{full_text}
""",
            os.path.join(output_dir, "temp2.txt"),
            "废标项"
        ),
        (
            r'不\s*得(?!\s*(分|力))|禁\s*止\s*投\s*标',
            """以下是从招标文件中摘取的内容,文本中序号分明,文本内的条款以'...............'分割。条款规定了各方不得存在的情形。请根据以下要求进行筛选:
**投标相关主体与非投标相关主体的定义**:
投标相关主体:包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
非投标相关主体:包括但不限于“招标人”、“采购人”、“评标委员会”或其他描述非投标方的词语。
**筛选要求**:
1. **仅筛选**明确描述投标相关主体禁止情形或不得存在的情形的条款,不包含笼统或未具体说明情形的条款。例如:
若条款内容包含'投标人不得存在的其他关联情形'这样的笼统描述,而未说明具体的情形,则无需添加该条款。
2. **排除**仅描述非投标相关主体行为限制或禁止情形的条款,例如“招标人不得泄露信息”或“评标委员会不得收受贿赂”,则无需返回。
3. 若条款同时描述了对投标相关主体与非投标相关主体的行为限制、禁止情形,也需返回。
4. **特殊情况**:如果条款中包含“磋商小组”、“各方”等既能指代投标相关主体又能指代非投标相关主体的词汇:
若在语境中其指代或包含投标相关主体,则应将其考虑在内;否则,排除该条款。

**输出格式**:
返回结果以 [x, x, x] 的形式,其中 x 为符合条件的条款的序号,为自然数。无需额外返回解释与说明。
如果没有符合条件的条款,返回 `[]`。
**示例**:
- **符合条件**:
  - `1. 投标人不得...` → 包含,返回序号 1。
  - `3. 联合体投标各方不得...` → 包含,返回序号 3。
- **不符合条件**:
  - `2. 采购人不得...` → 主语为“采购人”,排除。
- 示例输出:[1,3]
请根据上述筛选要求,阅读以下文本内容,并返回符合条件的条款序号。

文本内容:{full_text}
""",
            os.path.join(output_dir, "temp3.txt"),
            "不得存在的情形"
        )
    ]
    results = []

    # Run the three queries in parallel on a thread pool
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, invalid_docpath, user_query, output_file, result_key, keywords)
            futures.append((future, result_key))  # keep submission order

        for future, result_key in futures:
            try:
                result = future.result()
            except Exception as e:
                print(f"线程处理 {result_key} 时出错: {e}")
                result = {result_key: ""}
            results.append(result)
    combined_dict = {}
    for d in results:
        combined_dict.update(d)

    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}
if __name__ == '__main__':
    start_time = time.time()
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx"
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
    pdf_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\招标文件(107国道)_invalid.pdf'

    output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
    # invalid_added = insert_mark(pdf_path)
    # invalid_added_docx = pdf2docx(invalid_added)
    invalid_added_docx = r'C:\Users\Administrator\Desktop\fsdownload\355fbb20-e439-48dd-abdd-7645d424af36\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
    # print("Elapsed time:", str(end_time - start_time))