zbparse/flask_app/general/无效标和废标公共代码.py

374 lines
19 KiB
Python
Raw Normal View History

2024-11-12 17:12:47 +08:00
import re
from docx import Document
# 完整读取文件中所有表格适合pdf转docx价格便宜的情况优先推荐内容完整
def read_tables_from_docx(file_path):
# 尝试打开文档
try:
doc = Document(file_path)
except Exception as e:
print(f"Error opening file: {e}")
return []
# 初始化列表来保存符合条件的单元格内容
cell_contents = []
# 读取文档中的所有表格
if not doc.tables:
print("No tables found in the document.")
return []
# 遍历文档中的每个表格
for table_idx, table in enumerate(doc.tables):
# 遍历表格中的每一行
for row_idx, row in enumerate(table.rows):
# 遍历每一行中的单元格
for cell in row.cells:
cell_text = cell.text.strip() # 去除单元格内容前后空白
if len(cell_text) > 8: # 检查文字数量是否大于5
cell_contents.append(cell_text)
# 返回符合条件的单元格内容
return cell_contents
#处理跨页的段落
def preprocess_paragraphs(paragraphs):
processed = [] # 初始化处理后的段落列表
index = 0
#排除遇到表格、填空的情况
def has_long_spaces(text, max_space_count=5):
return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
while index < len(paragraphs):
current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白
# 检查当前段落是否为空
if current_text == '':
# 确保有前一个和后一个段落
if 0 < index < len(paragraphs) - 1:
prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本
next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本
# **新增部分:检查前一个段落和后一个段落都非空** 内部没有超过5个连续空格
if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
# 检查前一个段落的文本是否不以标点符号结尾
if not prev_text.endswith(('', ',', '', '!', '?')):
# 定义列表项的模式
list_item_pattern = r'^\s*([\(]\d+[\)]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
# 检查后一个段落是否以列表模式开头
is_next_list = re.match(list_item_pattern, next_text)
# 如果后一个段落不是列表项且前一个段落长度大于30
if not is_next_list and len(prev_text) > 30:
# 合并前一个和后一个段落的文本
merged_text = prev_text + ' ' + next_text # 为了可读性添加空格
if processed:
# 更新处理后的最后一个段落
processed[-1] = merged_text
else:
# 如果列表为空,直接添加合并后的文本
processed.append(merged_text)
# 跳过下一个段落,因为它已经被合并
index += 2
continue
# 如果没有满足条件,不进行合并,跳过当前空段落
else:
# 非空段落,添加到处理后的列表中
processed.append(current_text)
index += 1
return processed
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
from collections import OrderedDict
from docx import Document
import re
if isinstance(keywords, str):
keywords = [keywords]
doc = Document(doc_path)
extracted_paragraphs = OrderedDict()
continue_collecting = False
current_section_pattern = None
active_key = None
def match_keywords(text, patterns):
# 首先检查关键词是否匹配
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
# 如果当前的模式是 '不\s*得',则额外检查是否匹配 '不得分'
if '\s*得' in pattern: # 使用字符串匹配检查模式
post_match_text = text[match.end():].strip()
if post_match_text.startswith(""):
continue # 如果是"不得分",跳过这个匹配
return True
return False
def extract_from_text(text, current_index):
nonlocal continue_collecting, current_section_pattern, active_key
if text == "":
return current_index
if continue_collecting:
if current_section_pattern and re.match(current_section_pattern, text):
continue_collecting = False
active_key = None
else:
if active_key is not None:
extracted_paragraphs[active_key].append(text)
return current_index
if match_keywords(text, keywords):
active_key = text
extracted_paragraphs[active_key] = [text]
if match_keywords(text, follow_up_keywords):
continue_collecting = True
section_number = re.match(r'^(\d+([.]\d+)*)\s*[.]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.'
if section_number:
current_section_number = section_number.group(1)
level_count = current_section_number.count('.')
# Pattern to match current level, e.g., 3.4.5
pattern = r'^' + (r'\d+\s*[.]\s*') * level_count + r'\d+'
# Generate patterns for next section at same level and parent level
parts = current_section_number.split('.')
matched_patterns = [pattern] # start with the full pattern
# Next section at same level
parts[-1] = str(int(parts[-1]) + 1)
# next_pattern = r'^' + r'\.\s*'.join(parts) #11.12以前版本
next_pattern = r'^' + r'\s*[.]\s*'.join(parts)
matched_patterns.append(next_pattern)
# Parent section (if applicable)
if len(parts) > 1:
parent_section_parts = parts[:-1]
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
parent_pattern = r'^' + r'\s*[.]\s*'.join(parent_section_parts)
matched_patterns.append(parent_pattern)
# 添加对 '数字 、' 格式的支持
digit_comma_pattern = r'^\d+\s*、'
matched_patterns.append(digit_comma_pattern)
# Combine the patterns
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
current_section_pattern = re.compile(combined_pattern)
else:
found_next_number = False
current_section_pattern = None
while current_index < len(doc.paragraphs) - 1:
current_index += 1
next_text = doc.paragraphs[current_index].text.strip()
if not found_next_number:
# 修改后的正则,支持 '数字 、' 格式
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)',
next_text)
if next_section_number:
found_next_number = True
if next_section_number.group(1):
section_parts = next_section_number.group(1).split('.')
dynamic_pattern = r'^' + r'[.]'.join(
[r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(\]\d+[\)\]'
elif next_section_number.group(3):
dynamic_pattern = r'^\d+\s*、'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
else:
continue_collecting = False
active_key = None
break
return current_index
processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
index = 0
while index < len(processed_paragraphs):
index = extract_from_text(processed_paragraphs[index].strip(), index)
index += 1
return extracted_paragraphs
"""
eg:
text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
new_text_list = ["这是第一句。", "1. 接下来是第二句!", "(3) 最后一句。"]
"""
def preprocess_text_list(text_list):
new_text_list = []
# 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字
split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。?!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\[1-9]\d*\)')
for text in text_list:
# 使用正则表达式检查并拆分元素
parts = split_pattern.split(text)
new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查
return new_text_list
def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达式提取到的东西格式化
all_texts1 = []
all_texts2 = []
# 定义用于分割句子的正则表达式,包括中文和西文的结束标点
split_pattern = r'(?<=[。!?\!\?])'
for key, text_list in extracted_contents.items():
if len(text_list) == 1:
# print(text_list)
# print("------------------")
for data in text_list:
# print(data)
# 检查是否包含任何需要排除的字符串
if any(exclude in data for exclude in excludes):
continue # 如果包含任何排除字符串,跳过这个数据
# 去掉开头的序号eg:1 | (1) |2 | 1. | 2全角点| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|)?|[一二三四五六七八九十]+、)'
data = re.sub(pattern, '', data).strip()
keyword_match = re.search(keywords, data)
if keyword_match:
# 从关键词位置开始查找结束标点符号
start_pos = keyword_match.start()
# 截取从关键词开始到后面的内容
substring = data[start_pos:]
# 按定义的结束标点分割
sentences = re.split(split_pattern, substring, 1)
if len(sentences) > 0 and sentences[0]:
# 只取第一句,保留标点
cleaned_text = data[:start_pos] + sentences[0] # eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。
# 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。
else:
cleaned_text = data # 如果没有标点,使用整个字符串
else:
# 如果没有找到关键词,保留原文本
cleaned_text = data
# 删除空格
cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '')
# 如果长度大于8则添加到结果列表
if len(cleaned_text_no_spaces) > 8:
all_texts1.append(cleaned_text_no_spaces)
else:
# print(text_list)
# print("*********")
new_text_list = preprocess_text_list(text_list)
# 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|)?|[一二三四五六七八九十]+、)'
data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号
# 将修改后的第一个元素和剩余的元素连接起来
new_text_list[0] = data # 更新列表中的第一个元素
joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们
# 删除空格
joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '')
all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中
return all_texts1, all_texts2 # all_texts1要额外用gpt all_text2直接返回结果
def extract_table_with_keywords(data, keywords, follow_up_keywords):
"""遍历列表中的每个元素,查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
sentences1 = [] # 保存没有后续关键词的情况
sentences2 = [] # 保存有后续关键词的情况
# 编译关键词的正则表达式,提高匹配性能
keywords_pattern = re.compile(keywords, re.IGNORECASE)
follow_up_patterns = [re.compile(fu, re.IGNORECASE) for fu in follow_up_keywords]
# 检查是否包含 '无效报价' 的关键词
check_invalid_bidding = bool(re.search(r'\s*效\s*报\s*价', keywords, re.IGNORECASE))
# 定义用于提取括号内容的正则表达式,支持中英文括号
bracket_pattern = re.compile(r'[(][^())]+[)]')
# 遍历列表中的每个字符串元素
for item in data:
# 只有在 keywords 包含 '无效报价' 时,才检查 "无效报价"
if check_invalid_bidding and re.search(r'\s*效\s*报\s*价', item, re.IGNORECASE):
sentences1.append(item.strip())
continue
# 先检查 item 是否包含任意关键词,如果不包含,则跳过分割
if not keywords_pattern.search(item):
continue
# 1. 先提取并替换括号内容
bracket_contents = []
def replace_bracket_content(match):
bracket_contents.append(match.group(0)) # 保存括号内容
return f"<BRACKET_{len(bracket_contents) - 1}>" # 使用占位符替换括号内容
item_with_placeholders = bracket_pattern.sub(replace_bracket_content, item)
# 2. 分割句子,保证句子完整性(按标点符号和序号分割)
split_sentences = re.split(
r'(?<=[。!?!?\?])|' # 在中文句号、感叹号、问号或分号后面分割
r'(?=\d+[.]\d+)|' # 在类似1.1的数字序号前分割
r'(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒]))|' # 数字后面跟空格且空格后面不是指定关键字时分割
r'(?=\d+[、.])|' # 在数字后直接跟顿号、半角点号或全角点号时分割
r'(?=[A-Za-z][.]\s*)|' # 在字母加点如A.、a.)前分割
r'(?=[A-Za-z]+\s*\d+\s*(?:[.]\s*\d+)*)|' # 在可选字母加数字或多级编号前分割
r'(?=[一二三四五六七八九十]+、)', # 在中文数字加顿号(如一、二、)前分割
item_with_placeholders
)
# 3. 还原括号内容
split_sentences = [re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], s) for s in
split_sentences]
# 接下来是处理包含和不包含后续关键词的情况
i = 0
while i < len(split_sentences):
sentence = split_sentences[i].strip()
if keywords_pattern.search(sentence):
# 处理 "不\s*得" 的特殊情况,跳过包含 "不得分" 的句子
not_allowed_match = re.search(r'\s*得', sentence, re.IGNORECASE)
if not_allowed_match:
post_match_text = sentence[not_allowed_match.end():].strip()
if post_match_text.startswith(""):
i += 1 # 跳过这个句子
continue
# 检查是否存在后续关键词
follow_up_present = any(fp.search(sentence) for fp in follow_up_patterns)
if follow_up_present:
# 如果存在后续关键词,则从当前位置开始截取
start_index = i
end_index = start_index
found_next_section = False
for j in range(start_index + 1, len(split_sentences)):
if re.match(r'\d+[.]\d+([.]\d+)?', split_sentences[j].strip()):
end_index = j
found_next_section = True
break
if found_next_section:
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
else:
full_text = ' '.join(split_sentences[start_index:]).strip()
# 清洗文本,去除前缀编号等
pattern = r'^\s*([(]\d+[)]|[A-Za-z][.]\s*|[A-Za-z]?\d+\s*([.]\s*\d+)*(\s|[.]|、|)?|[一二三四五六七八九十]+、)'
full_text = re.sub(pattern, '', full_text).replace(' ', '').strip()
sentences2.append(full_text) # 存储有后续关键词的情况
i = end_index if found_next_section else len(split_sentences)
else:
# 没有后续关键词的情况
pattern = r'^\s*([(]\d+[)]|[A-Za-z][.]\s*|[A-Za-z]?\d+\s*([.]\s*\d+)*(\s|[.]|、|)?|[一二三四五六七八九十]+、)'
cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ', '').strip()
if len(cleaned_sentence) > 8:
sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况
i += 1
else:
i += 1
return sentences1, sentences2 # 返回两个列表