zbparse/flask_app/general/无效标和废标公共代码.py
2024-11-13 09:41:37 +08:00

381 lines
20 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from docx import Document
# Reads every table of the document in full. The original note recommends this
# reader for PDF-to-DOCX converted files, where keeping the content complete
# matters most.
def read_tables_from_docx(file_path, min_length=8):
    """Return the text of every sufficiently long table cell in a .docx file.

    Args:
        file_path: Path (or file-like object) handed to ``Document``.
        min_length: A cell is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 8.

    Returns:
        A list of stripped cell texts in table / row / cell order. The list
        is empty when the file cannot be opened or contains no tables.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        # Deliberate best-effort: an unreadable file yields "no content"
        # instead of aborting the caller.
        print(f"Error opening file: {e}")
        return []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Very short cells are dropped.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)
    return cell_contents
# Repairs paragraphs that were broken in two (e.g. by a page break).
def preprocess_paragraphs(paragraphs):
    """Clean a paragraph sequence and re-join paragraphs split by a blank one.

    Processing rules:
      * Paragraphs that start with a Chinese-numeral heading such as ``一、``
        or ``(一)`` / ``（一）`` are dropped.
      * Blank paragraphs are dropped.
      * When a blank paragraph sits between two non-blank ones, the text
        after it is appended to the text before it (separated by one space)
        if all of the following hold: the preceding paragraph was kept in the
        output, neither neighbour contains a run of more than 5 whitespace
        characters (tables / fill-in blanks), the preceding text is longer
        than 30 characters and does not end with punctuation, and the
        following text does not start like a list item.

    Args:
        paragraphs: Sequence of objects exposing a ``text`` attribute.

    Returns:
        list[str]: The stripped, filtered and merged paragraph texts.
    """
    heading_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    # Both ASCII and full-width brackets are accepted.
    heading_bracketed = re.compile(r'^\s*[(（]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')
    list_item = re.compile(r'^\s*([\(（]\d+[\)）]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)')
    # NOTE(review): two entries of the original tuple were lost and had become
    # '' (which made endswith() always true and disabled merging entirely).
    # The full-width forms below are a reconstruction - confirm against the
    # upstream file.
    closing_punctuation = ('。', '，', ',', '；', '！', '!', '？', '?')

    def has_long_spaces(text, max_space_count=5):
        # Long whitespace runs indicate tables or fill-in blanks.
        return any(len(run) > max_space_count for run in re.findall(r'\s+', text))

    texts = [paragraph.text.strip() for paragraph in paragraphs]
    total = len(texts)

    processed = []
    # Index of the source paragraph whose text currently ends processed[-1];
    # a merge is only valid when that is the paragraph right before the blank.
    last_kept = -1
    index = 0
    while index < total:
        current = texts[index]

        if heading_numbered.match(current) or heading_bracketed.match(current):
            index += 1
            continue

        if current:
            processed.append(current)
            last_kept = index
            index += 1
            continue

        # Blank paragraph: decide whether its neighbours belong together.
        if 0 < index < total - 1 and last_kept == index - 1:
            prev_text = texts[index - 1]
            next_text = texts[index + 1]
            mergeable = (
                bool(next_text)
                and not has_long_spaces(prev_text)
                and not has_long_spaces(next_text)
                and not prev_text.endswith(closing_punctuation)
                and len(prev_text) > 30
                and not list_item.match(next_text)
            )
            if mergeable:
                # Extend the kept entry rather than overwrite it, so text
                # gathered by an earlier merge is preserved.
                processed[-1] = processed[-1] + ' ' + next_text
                last_kept = index + 1
                index += 2  # the following paragraph has been consumed
                continue

        index += 1  # plain blank paragraph: skip it
    return processed
# If the keyword paragraph carries a section number, collection continues until
# a paragraph with a matching numbering style is met.
# If it carries no number, the numbered items that follow it (all of the same
# numbering style) are collected.
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Extract keyword paragraphs, plus their continuation, from a .docx file.

    Every paragraph (after ``preprocess_paragraphs``) matching one of
    ``keywords`` becomes a key of the result. When that paragraph also matches
    one of ``follow_up_keywords``, the paragraphs belonging to it are appended
    to the key's list:

      * numbered key paragraph (``3.4`` ...): following paragraphs are
        collected until one starts with a number of the same depth, the next
        sibling number, the next parent number, or ``<digits>、``;
      * un-numbered key paragraph: the numbering style of the first numbered
        paragraph below it is detected and consecutive paragraphs of that
        style are collected.

    A match of a pattern containing ``\\s*得`` is ignored when the text right
    after the match starts with "分" (i.e. "不得分").

    Args:
        doc_path: Path handed to ``docx.Document``.
        keywords: Regex pattern or list of regex patterns.
        follow_up_keywords: Regex pattern or list of regex patterns.

    Returns:
        OrderedDict[str, list[str]]: key paragraph -> [key paragraph, ...
        collected paragraphs], in document order.
    """
    from collections import OrderedDict
    from docx import Document
    import re

    if isinstance(keywords, str):
        keywords = [keywords]
    # A bare string would otherwise be iterated character by character.
    if isinstance(follow_up_keywords, str):
        follow_up_keywords = [follow_up_keywords]

    doc = Document(doc_path)
    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)

    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        """Return True when any pattern matches, ignoring "不得分" hits."""
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            if r'\s*得' in pattern:
                # The literal was lost in the source (startswith("") is always
                # true); "分" is restored from the accompanying comment.
                if text[match.end():].strip().startswith("分"):
                    continue
            return True
        return False

    def build_stop_pattern(section_number):
        """Compile the pattern that ends collection for a numbered paragraph."""
        level_count = section_number.count('.')
        parts = section_number.split('.')
        # Any number of the same depth, e.g. 3.4.5 -> d.d.d
        patterns = [r'^' + (r'\d+\s*[.．]\s*') * level_count + r'\d+']
        # Next section on the same level.
        parts[-1] = str(int(parts[-1]) + 1)
        patterns.append(r'^' + r'\s*[.．]\s*'.join(parts))
        # Next section on the parent level, if there is one.
        if len(parts) > 1:
            parent_parts = parts[:-1]
            parent_parts[-1] = str(int(parent_parts[-1]) + 1)
            patterns.append(r'^' + r'\s*[.．]\s*'.join(parent_parts))
        # "<digits>、" style headings.
        patterns.append(r'^\d+\s*、')
        return re.compile(r'(' + r')|('.join(patterns) + r')')

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                # A new section starts here: stop collecting and fall through
                # so this paragraph is itself checked for keywords.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
                return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．]?', text)
                if section_number:
                    # Normalise full-width dots so count()/split()/int() work.
                    normalized = section_number.group(1).replace('．', '.')
                    current_section_pattern = build_stop_pattern(normalized)
                else:
                    found_next_number = False
                    current_section_pattern = None
                    # Look ahead in the SAME list the caller iterates over
                    # (the index does not apply to the raw doc.paragraphs).
                    while current_index < len(processed_paragraphs) - 1:
                        current_index += 1
                        next_text = processed_paragraphs[current_index].strip()
                        if not found_next_number:
                            next_section_number = re.match(
                                r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)|([(（]\d+[)）])|(\d+\s*、)',
                                next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = re.split(r'[.．]', next_section_number.group(1))
                                    dynamic_pattern = r'^' + r'[.．]'.join(
                                        [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    dynamic_pattern = r'^[\(（]\d+[\)）]'
                                elif next_section_number.group(3):
                                    dynamic_pattern = r'^\d+\s*、'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            # Step back so the caller re-examines the paragraph
                            # that ended the run instead of skipping it.
                            current_index -= 1
                            break
        return current_index

    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs
def preprocess_text_list(text_list):
    """Split each text where a new numbered item starts after Chinese text.

    A split happens at whitespace that directly follows a CJK character or a
    sentence-ending punctuation mark and is itself followed by a letter, a
    digit, or a bracketed number such as ``(3)`` / ``（3）``. The pieces are
    stripped and empty pieces are discarded.

    Example:
        ["这是第一句。 1. 接下来是第二句！ (3) 最后一句。"]
        -> ["这是第一句。", "1. 接下来是第二句！", "(3) 最后一句。"]

    Args:
        text_list: Iterable of strings.

    Returns:
        list[str]: All non-empty pieces, in order.
    """
    # The full-width bracket branch had lost its brackets in the source, which
    # left the look-ahead unterminated (re.error); it is restored here.
    split_pattern = re.compile(
        r'(?<=[\u4e00-\u9fff。？！；?!;])'
        r'(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+（[1-9]\d*）)'
    )
    new_text_list = []
    for text in text_list:
        for part in split_pattern.split(text):
            stripped = part.strip()
            if stripped:  # drop empty pieces
                new_text_list.append(stripped)
    return new_text_list
def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalise the paragraphs gathered per keyword hit.

    Entries whose list holds a single text are trimmed to the sentence that
    contains the keyword; entries with several texts are joined into one
    block.

    Args:
        extracted_contents: Mapping ``key -> list[str]``.
        keywords: Regex pattern (string) used to locate the keyword.
        excludes: Iterable of substrings; a single-text entry containing any
            of them is discarded.

    Returns:
        tuple[list[str], list[str]]:
            * first list - single-text entries: leading numbering removed,
              text cut after the first sentence end that follows the keyword,
              spaces removed, kept only if longer than 8 characters;
            * second list - multi-text entries: re-split with
              ``preprocess_text_list``, leading numbering removed from the
              first piece, pieces joined with newlines, spaces removed.
        The original note says the first list still needs GPT post-processing
        while the second can be returned directly.
    """
    # Sentence terminators, Chinese and Western.
    sentence_end = re.compile(r'(?<=[。！？!?])')
    # Leading numbering, e.g. 1 | (1) | （1） | 1. | 2． | 3、 | 1.1 | 2.3.4 |
    # A1 | C1.1 | 一、
    leading_number = re.compile(
        r'^\s*([(（]\d+[)）]|[A-Za-z]?\d+\s*([.．]\s*\d+)*(\s|[.．]|、)?|[一二三四五六七八九十]+、)'
    )

    def strip_spaces(text):
        # Removes ASCII spaces and ideographic (full-width) spaces.
        return text.replace(' ', '').replace('\u3000', '')

    all_texts1 = []
    all_texts2 = []
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                if any(exclude in data for exclude in excludes):
                    continue
                data = leading_number.sub('', data).strip()
                cleaned_text = data  # default: keep the whole text
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    start_pos = keyword_match.start()
                    # Keep everything up to the first sentence end after the
                    # keyword (terminator included).
                    sentences = sentence_end.split(data[start_pos:], 1)
                    if sentences and sentences[0]:
                        cleaned_text = data[:start_pos] + sentences[0]
                cleaned_text_no_spaces = strip_spaces(cleaned_text)
                if len(cleaned_text_no_spaces) > 8:
                    all_texts1.append(cleaned_text_no_spaces)
        else:
            new_text_list = preprocess_text_list(text_list)
            if not new_text_list:
                # Nothing but whitespace (or an empty list): nothing to emit.
                continue
            new_text_list[0] = leading_number.sub('', new_text_list[0]).strip()
            joined_text = "\n".join(new_text_list)
            all_texts2.append(strip_spaces(joined_text))
    return all_texts1, all_texts2
def extract_table_with_keywords(data, keywords, follow_up_keywords):
    """Find keyword sentences in a list of table-cell texts.

    Each item is split into sentences (at sentence-ending punctuation and in
    front of section numbers); sentences matching ``keywords`` are cleaned of
    their leading numbering and of spaces, then sorted into two lists.

    Args:
        data: Iterable of strings.
        keywords: Regex pattern (string); matched case-insensitively.
        follow_up_keywords: Regex pattern or iterable of patterns. A keyword
            sentence matching one of them is treated as the head of a list
            and is returned together with the sentences that follow it, up to
            the next sentence starting with an ``N.N`` section number.

    Returns:
        tuple[list[str], list[str]]:
            * sentences without a follow-up keyword (kept only if longer than
              8 characters); when ``keywords`` itself contains "无效报价",
              whole items containing "无效报价" are also placed here;
            * text blocks for sentences with a follow-up keyword.
        A sentence in which "不得" is directly followed by "分" ("不得分") is
        skipped.
    """
    if isinstance(follow_up_keywords, str):
        # A bare string would otherwise be iterated character by character.
        follow_up_keywords = [follow_up_keywords]

    keywords_pattern = re.compile(keywords, re.IGNORECASE)
    follow_up_patterns = [re.compile(fu, re.IGNORECASE) for fu in follow_up_keywords]

    # NOTE(review): the leading "无" / "不" and the literal "分" were missing
    # from the source literals (which made the pattern match "有效报价" and the
    # "不得分" test always true); restored from the accompanying comments.
    invalid_bid_pattern = re.compile(r'无\s*效\s*报\s*价', re.IGNORECASE)
    not_allowed_pattern = re.compile(r'不\s*得', re.IGNORECASE)
    check_invalid_bidding = bool(invalid_bid_pattern.search(keywords))

    # Bracketed content, ASCII or full-width brackets; protected from splitting.
    bracket_pattern = re.compile(r'[(（][^()（）]+[)）]')
    placeholder_pattern = re.compile(r"<BRACKET_(\d+)>")

    # Sentence splitter. The look-behind guards keep a split from landing in
    # the middle of a number ("12.3", "1.1.2"), a word or a Chinese numeral.
    # NOTE(review): "；" is included because the original comment lists the
    # semicolon as a terminator - confirm against the upstream file.
    sentence_splitter = re.compile(
        r'(?<=[。！？；!?])|'                                   # after sentence-ending punctuation
        r'(?<![\dA-Za-z.．])(?=\d+[.．]\d+)|'                    # before numbers like 1.1
        r'(?<![\dA-Za-z.．])(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒]))|'  # digits + space, not a unit
        r'(?<![\dA-Za-z.．])(?=\d+[、.．])|'                     # digits + 、 or dot
        r'(?<![A-Za-z])(?=[A-Za-z][.．]\s*)|'                   # letter + dot (A. / a.)
        r'(?<![A-Za-z])(?=[A-Za-z]+\s*\d+\s*(?:[.．]\s*\d+)*)|'  # letters + (multi-level) number
        r'(?<![一二三四五六七八九十])(?=[一二三四五六七八九十]+、)'  # Chinese numeral + 、
    )
    next_section_pattern = re.compile(r'\d+[.．]\d+([.．]\d+)?')
    leading_number = re.compile(
        r'^\s*([(（]\d+[)）]|[A-Za-z][.．]\s*|[A-Za-z]?\d+\s*([.．]\s*\d+)*(\s|[.．]|、)?|[一二三四五六七八九十]+、)'
    )

    sentences1 = []  # keyword sentences without a follow-up keyword
    sentences2 = []  # keyword sentences with a follow-up keyword

    for item in data:
        # Whole-item shortcut, only active when the keywords ask for it.
        if check_invalid_bidding and invalid_bid_pattern.search(item):
            sentences1.append(item.strip())
            continue

        if not keywords_pattern.search(item):
            continue

        # 1. Hide bracketed content behind placeholders so it is never split.
        bracket_contents = []

        def replace_bracket_content(match):
            bracket_contents.append(match.group(0))
            return f"<BRACKET_{len(bracket_contents) - 1}>"

        item_with_placeholders = bracket_pattern.sub(replace_bracket_content, item)

        # 2. Split into sentences, then 3. put the bracketed content back.
        split_sentences = [
            placeholder_pattern.sub(lambda m: bracket_contents[int(m.group(1))], piece)
            for piece in sentence_splitter.split(item_with_placeholders)
        ]

        i = 0
        total = len(split_sentences)
        while i < total:
            sentence = split_sentences[i].strip()
            if not keywords_pattern.search(sentence):
                i += 1
                continue

            # "不得分" is a scoring remark, not a prohibition: skip it.
            not_allowed_match = not_allowed_pattern.search(sentence)
            if not_allowed_match and sentence[not_allowed_match.end():].strip().startswith("分"):
                i += 1
                continue

            if any(fp.search(sentence) for fp in follow_up_patterns):
                # Head of a list: take everything up to the next N.N section.
                end_index = total
                for j in range(i + 1, total):
                    if next_section_pattern.match(split_sentences[j].strip()):
                        end_index = j
                        break
                full_text = ' '.join(split_sentences[i:end_index]).strip()
                full_text = leading_number.sub('', full_text).replace(' ', '').strip()
                sentences2.append(full_text)
                i = end_index
            else:
                cleaned_sentence = leading_number.sub('', sentence)
                cleaned_sentence = cleaned_sentence.replace('\n', '').replace(' ', '').strip()
                if len(cleaned_sentence) > 8:
                    sentences1.append(cleaned_sentence)
                i += 1
    return sentences1, sentences2