374 lines
19 KiB
Python
374 lines
19 KiB
Python
|
import re
|
|||
|
|
|||
|
from docx import Document
|
|||
|
# 完整读取文件中所有表格(适合pdf转docx价格便宜的情况,优先推荐,内容完整)
|
|||
|
def read_tables_from_docx(file_path, min_length=8):
    """Collect the text of every sufficiently long table cell in a .docx file.

    Args:
        file_path: Passed unchanged to ``Document`` to open the file.
        min_length: A cell is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 8, the threshold
            that was previously hard-coded.

    Returns:
        A list of stripped cell texts in table / row / cell order. An empty
        list is returned (and a message printed) when the document cannot be
        opened or contains no tables.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        # Best-effort reader: report the problem and let the caller carry on.
        print(f"Error opening file: {e}")
        return []

    tables = doc.tables
    if not tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Short cells are skipped; only longer texts are collected.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)
    return cell_contents
|
|||
|
|
|||
|
|
|||
|
#处理跨页的段落
|
|||
|
def preprocess_paragraphs(paragraphs):
    """Strip paragraphs, drop empty ones and re-join text split by a blank one.

    An empty paragraph sitting between two non-empty paragraphs is treated as
    an artificial break (e.g. a page break) and the two neighbours are joined
    with a single space when all of the following hold:

    * neither neighbour contains a run of more than 5 whitespace characters
      (used to rule out table rows / fill-in blanks),
    * the previous paragraph does not end with a comma, full stop,
      exclamation mark or question mark (ASCII or full-width),
    * the next paragraph does not start like a list item
      ("(1)", "(1)", "A.", "一、"),
    * the previous paragraph is longer than 30 characters.

    Args:
        paragraphs: Sequence of objects exposing a ``text`` attribute.

    Returns:
        A list of stripped, non-empty strings.
    """
    end_marks = (',', ',', '。', '!', '!', '?', '?')
    list_item_pattern = re.compile(
        r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
    )

    def has_long_spaces(text, max_space_count=5):
        # True when any whitespace run is longer than max_space_count.
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    processed = []
    total = len(paragraphs)
    index = 0
    while index < total:
        current_text = paragraphs[index].text.strip()
        if current_text:
            processed.append(current_text)
            index += 1
            continue

        # Empty paragraph: a merge is only possible with both neighbours present.
        if 0 < index < total - 1:
            prev_text = paragraphs[index - 1].text.strip()
            next_text = paragraphs[index + 1].text.strip()
            should_merge = (
                prev_text
                and next_text
                and not has_long_spaces(prev_text)
                and not has_long_spaces(next_text)
                and not prev_text.endswith(end_marks)
                and not list_item_pattern.match(next_text)
                and len(prev_text) > 30
            )
            if should_merge:
                if processed:
                    # Extend the last entry instead of rebuilding it from
                    # prev_text: after an earlier merge the last entry already
                    # holds more than prev_text and must not be truncated.
                    processed[-1] = processed[-1] + ' ' + next_text
                else:
                    processed.append(prev_text + ' ' + next_text)
                # The next paragraph has been consumed by the merge.
                index += 2
                continue

        # No merge: simply skip the empty paragraph.
        index += 1

    return processed
|
|||
|
|
|||
|
#如果当前段落有序号,则向下匹配直到遇到相同的序号样式
|
|||
|
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
|||
|
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Collect paragraphs of a .docx file that match keyword patterns.

    Every paragraph matching one of ``keywords`` becomes a key of the result.
    When that paragraph also matches one of ``follow_up_keywords`` the
    paragraphs after it are collected under the same key:

    * if the paragraph starts with a section number (``3``, ``3.4``,
      ``3.4.5`` ...), collection continues until a paragraph starts with a
      number of the same depth, the next sibling number, the next parent
      number, or a ``<digits>、`` prefix;
    * otherwise the numbering style of the first following paragraph is
      detected and consecutive paragraphs using that style are collected.

    Args:
        doc_path: Passed unchanged to ``Document``.
        keywords: A regex string or a list of regex strings.
        follow_up_keywords: Iterable of regex strings.

    Returns:
        An ``OrderedDict`` mapping the matched paragraph text to a list whose
        first element is that paragraph and whose remaining elements are the
        collected follow-up paragraphs. A paragraph text that occurs twice
        overwrites the earlier entry.
    """
    from collections import OrderedDict
    from docx import Document
    import re

    if isinstance(keywords, str):
        keywords = [keywords]

    doc = Document(doc_path)
    # Stripped, non-empty paragraph texts. Every index used below refers to
    # this list, never to doc.paragraphs, whose indices differ as soon as an
    # empty paragraph was dropped or two paragraphs were merged.
    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)

    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        # True when any pattern matches, ignoring "不得分" hits of the "不得" pattern.
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                if r'不\s*得' in pattern:
                    post_match_text = text[match.end():].strip()
                    if post_match_text.startswith("分"):
                        continue  # "不得分" is a scoring phrase, not a prohibition
                return True
        return False

    def extract_from_text(text, current_index):
        # Process one paragraph; returns the index of the last paragraph consumed.
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                # A new section starts here: stop collecting.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
            return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # Leading section number; ASCII and full-width dots are accepted.
                section_number = re.match(r'^(\d+(?:[..]\d+)*)\s*[..]?', text)
                if section_number:
                    parts = re.split(r'[..]', section_number.group(1))
                    level_count = len(parts) - 1

                    # Any number of the same depth, e.g. 3.4.5
                    matched_patterns = [
                        r'^' + r'\d+\s*[..]\s*' * level_count + r'\d+'
                    ]

                    # Next section at the same level
                    parts[-1] = str(int(parts[-1]) + 1)
                    matched_patterns.append(r'^' + r'\s*[..]\s*'.join(parts))

                    # Next parent section (if applicable)
                    if len(parts) > 1:
                        parent_section_parts = parts[:-1]
                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
                        matched_patterns.append(
                            r'^' + r'\s*[..]\s*'.join(parent_section_parts)
                        )

                    # "<digits>、" style numbering
                    matched_patterns.append(r'^\d+\s*、')

                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                    current_section_pattern = re.compile(combined_pattern)
                else:
                    found_next_number = False
                    current_section_pattern = None

                    while current_index < len(processed_paragraphs) - 1:
                        current_index += 1
                        next_text = processed_paragraphs[current_index].strip()
                        if not found_next_number:
                            # NOTE(review): the first alternative also matches a
                            # bare digit run, so the third one ("<digits>、")
                            # appears unreachable; kept as in the original.
                            next_section_number = re.match(
                                r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\d+[))])|(\d+\s*、)',
                                next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = re.split(
                                        r'[..]', next_section_number.group(1))
                                    dynamic_pattern = r'^' + r'[..]'.join(
                                        [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    dynamic_pattern = r'^[((]\d+[))]'
                                elif next_section_number.group(3):
                                    dynamic_pattern = r'^\d+\s*、'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            break

        return current_index

    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1

    return extracted_paragraphs
|
|||
|
|
|||
|
"""
|
|||
|
eg:
|
|||
|
text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
|
|||
|
new_text_list = ["这是第一句。", "1. 接下来是第二句!", "(3) 最后一句。"]
|
|||
|
"""
|
|||
|
def preprocess_text_list(text_list):
    """Split each text where a new numbered/lettered item starts mid-string.

    A split happens at a position that directly follows a CJK character or a
    sentence-ending mark (。 ; ! ? in ASCII or full-width form) and is followed
    by whitespace plus either a letter/digit or a parenthesised number such
    as ``(3)`` / ``(3)``.

    Example:
        ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
        -> ["这是第一句。", "1. 接下来是第二句!", "(3) 最后一句。"]

    Args:
        text_list: Iterable of strings.

    Returns:
        A flat list of the stripped, non-empty pieces in their original order.
    """
    split_pattern = re.compile(
        r'(?<=[\u4e00-\u9fff。;;!!??])'
        r'(?=\s+(?:[a-zA-Z\d]|[((][1-9]\d*[))]))'
    )

    new_text_list = []
    for text in text_list:
        for part in split_pattern.split(text):
            part = part.strip()
            if part:  # drop pieces that are empty after stripping
                new_text_list.append(part)
    return new_text_list
|
|||
|
|
|||
|
def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalise the paragraph groups produced by the keyword extraction.

    Groups with exactly one paragraph:
        * skipped when the paragraph contains any string from ``excludes``;
        * the leading numbering ("1", "(1)", "(2)", "1.", "2.", "3、",
          "1.1", "2.3.4", "A1", "C1.1", "一、") is removed;
        * when ``keywords`` matches, the text is cut after the first
          sentence-ending mark (。 ! ? in ASCII or full-width form) that
          follows the match; everything before the match is kept;
        * spaces are removed and the result is kept if longer than 8 chars.

    Groups with several paragraphs:
        * passed through ``preprocess_text_list``, the numbering of the first
          piece is removed, the pieces are joined with newlines and spaces
          are removed.

    Empty groups are ignored.

    Args:
        extracted_contents: Mapping whose values are lists of strings.
        keywords: Regex string searched in single-paragraph groups.
        excludes: Iterable of plain substrings that disqualify a paragraph.

    Returns:
        ``(all_texts1, all_texts2)``: results of the single-paragraph groups
        and of the multi-paragraph groups respectively.
    """
    # Leading numbering. The optional trailing separator is restricted to
    # whitespace, a dot (ASCII / full-width) or "、" so that it can never
    # swallow the first real character of the text.
    leading_number = re.compile(
        r'^\s*(?:[((]\d+[))]'
        r'|[A-Za-z]?\d+\s*(?:[..]\s*\d+)*[\s..、]?'
        r'|[一二三四五六七八九十]+、)'
    )
    sentence_end = re.compile(r'(?<=[。!!??])')

    def strip_spaces(text):
        # Remove ASCII and ideographic (U+3000) spaces.
        return text.replace(' ', '').replace('\u3000', '')

    all_texts1 = []
    all_texts2 = []

    for text_list in extracted_contents.values():
        if not text_list:
            continue

        if len(text_list) == 1:
            data = text_list[0]
            if any(exclude in data for exclude in excludes):
                continue

            data = leading_number.sub('', data).strip()
            cleaned_text = data
            keyword_match = re.search(keywords, data)
            if keyword_match:
                start_pos = keyword_match.start()
                # Keep only the first sentence after the keyword (mark included).
                first_sentence = sentence_end.split(data[start_pos:], maxsplit=1)[0]
                if first_sentence:
                    cleaned_text = data[:start_pos] + first_sentence

            cleaned_text_no_spaces = strip_spaces(cleaned_text)
            if len(cleaned_text_no_spaces) > 8:
                all_texts1.append(cleaned_text_no_spaces)
        else:
            new_text_list = preprocess_text_list(text_list)
            if not new_text_list:
                continue
            new_text_list[0] = leading_number.sub('', new_text_list[0]).strip()
            joined_text = "\n".join(new_text_list)
            all_texts2.append(strip_spaces(joined_text))

    # all_texts1 needs further processing by the caller; all_texts2 is final.
    return all_texts1, all_texts2
|
|||
|
|
|||
|
|
|||
|
def extract_table_with_keywords(data, keywords, follow_up_keywords):
    """Extract keyword sentences from table-cell texts.

    Each item of ``data`` that matches ``keywords`` is split into sentences
    (after 。 ! ? ; and in front of numbering such as ``1.1``, ``3 ``, ``2、``,
    ``A.``, ``B1.2``, ``一、``). Bracketed text is protected from splitting.
    For every sentence that matches ``keywords``:

    * it is skipped when its first "不得" is directly followed by "分";
    * when it also matches a follow-up pattern, it is joined with the
      following sentences up to (not including) the next sentence that starts
      with a dotted section number, its leading numbering and spaces are
      removed, and the result goes to the second list;
    * otherwise its leading numbering, newlines and spaces are removed and it
      goes to the first list if longer than 8 characters.

    When ``keywords`` itself contains "无效报价", any item containing
    "无效报价" is put, stripped but otherwise unchanged, into the first list.

    Args:
        data: Iterable of strings.
        keywords: Regex string (matched case-insensitively).
        follow_up_keywords: Iterable of regex strings (case-insensitive).

    Returns:
        ``(sentences1, sentences2)``: sentences without / with a follow-up
        keyword.
    """
    keywords_pattern = re.compile(keywords, re.IGNORECASE)
    follow_up_patterns = [re.compile(fu, re.IGNORECASE) for fu in follow_up_keywords]

    invalid_bid_pattern = re.compile(r'无\s*效\s*报\s*价', re.IGNORECASE)
    check_invalid_bidding = bool(invalid_bid_pattern.search(keywords))

    # Bracketed text in ASCII or full-width brackets.
    bracket_pattern = re.compile(r'[((][^(())]+[))]')
    placeholder_pattern = re.compile(r'<BRACKET_(\d+)>')

    # A numbering look-ahead must only fire at the START of a numbering token.
    # Without this guard "12 x" is cut into "1" | "2 x" and "3.1 x" into
    # "3." | "1 x", which also defeats the next-section detection below.
    token_start = r'(?<![\dA-Za-z..])'
    split_pattern = re.compile('|'.join([
        r'(?<=[。!!??;;])',                                          # after sentence-ending marks
        token_start + r'(?=\d+[..]\d+)',                              # before "1.1"
        token_start + r'(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒]))',  # before "3 " unless a unit word follows
        token_start + r'(?=\d+[、..])',                               # before "2、" / "2."
        token_start + r'(?=[A-Za-z][..]\s*)',                         # before "A."
        token_start + r'(?=[A-Za-z]+\s*\d+\s*(?:[..]\s*\d+)*)',       # before "B1.2"
        r'(?=[一二三四五六七八九十]+、)',                                # before "一、"
    ]))
    next_section_pattern = re.compile(r'\d+[..]\d+([..]\d+)?')

    # Leading numbering; the optional trailing separator is limited to
    # whitespace, a dot or "、" so it cannot swallow real text.
    leading_number = re.compile(
        r'^\s*(?:[((]\d+[))]'
        r'|[A-Za-z][..]\s*'
        r'|[A-Za-z]?\d+\s*(?:[..]\s*\d+)*[\s..、]?'
        r'|[一二三四五六七八九十]+、)'
    )
    not_allowed_pattern = re.compile(r'不\s*得', re.IGNORECASE)

    sentences1 = []  # sentences without a follow-up keyword
    sentences2 = []  # sentences with a follow-up keyword

    for item in data:
        if check_invalid_bidding and invalid_bid_pattern.search(item):
            sentences1.append(item.strip())
            continue

        if not keywords_pattern.search(item):
            continue

        # 1. Replace bracketed text by placeholders so it is never split.
        bracket_contents = []

        def stash_bracket(match):
            bracket_contents.append(match.group(0))
            return f"<BRACKET_{len(bracket_contents) - 1}>"

        item_with_placeholders = bracket_pattern.sub(stash_bracket, item)

        # 2. Split into sentences, 3. restore the bracketed text.
        split_sentences = [
            placeholder_pattern.sub(lambda m: bracket_contents[int(m.group(1))], piece)
            for piece in split_pattern.split(item_with_placeholders)
        ]

        total = len(split_sentences)
        i = 0
        while i < total:
            sentence = split_sentences[i].strip()
            if not keywords_pattern.search(sentence):
                i += 1
                continue

            # "不得分" is a scoring phrase, not a prohibition: skip the sentence.
            not_allowed_match = not_allowed_pattern.search(sentence)
            if not_allowed_match:
                post_match_text = sentence[not_allowed_match.end():].strip()
                if post_match_text.startswith("分"):
                    i += 1
                    continue

            if any(fp.search(sentence) for fp in follow_up_patterns):
                # Collect up to the next dotted section number, or to the end.
                end_index = total
                for j in range(i + 1, total):
                    if next_section_pattern.match(split_sentences[j].strip()):
                        end_index = j
                        break
                full_text = ' '.join(split_sentences[i:end_index]).strip()
                full_text = leading_number.sub('', full_text).replace(' ', '').strip()
                sentences2.append(full_text)
                i = end_index
            else:
                cleaned_sentence = (
                    leading_number.sub('', sentence)
                    .replace('\n', '')
                    .replace(' ', '')
                    .strip()
                )
                if len(cleaned_sentence) > 8:
                    sentences1.append(cleaned_sentence)
                i += 1

    return sentences1, sentences2
|