zbparse/flask_app/general/投标人须知正文条款提取成json文件.py
2024-12-30 10:48:03 +08:00

410 lines
20 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from PyPDF2 import PdfReader
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
def compare_headings(current, new, max_top_level_gap=3):
    """Decide whether heading number ``new`` comes after ``current``.

    Both arguments are dotted heading numbers such as ``"1.2.3"`` or ``"4."``.
    Only the purely numeric dot-separated parts are considered; empty or
    non-numeric parts are ignored.

    Args:
        current (str): The heading currently being collected, e.g. ``"1.2.3"``.
        new (str): The candidate heading, e.g. ``"1.3"``.
        max_top_level_gap (int): Largest allowed jump of the first number.
            A candidate whose first number exceeds the current first number
            by more than this is rejected (default 3).

    Returns:
        bool: ``True`` when ``new`` sorts after ``current`` (or is a deeper
        sub-heading of it) and the first-number jump is within the limit;
        ``False`` otherwise, including when ``new`` has no numeric part.
    """
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]

    # A candidate without any number can never be a following heading.
    if not new_nums:
        return False

    # Reject implausibly large jumps of the top-level number.
    if current_nums and new_nums[0] > current_nums[0] + max_top_level_gap:
        return False

    # The first differing level decides the ordering.
    for current_part, new_part in zip(current_nums, new_nums):
        if new_part != current_part:
            return new_part > current_part

    # Identical shared prefix: only a deeper candidate counts as new.
    return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
    """Tell whether a line break should follow the text collected so far.

    Args:
        content: Iterable of string fragments; they are concatenated and
            stripped before being inspected.
        keywords: Substrings whose presence in the text asks for a break.
        max_length (int): Texts of at most this many characters always get
            a break.

    Returns:
        bool: ``True`` if the joined text is short enough or contains any
        of the keywords.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    for keyword in keywords:
        if keyword in joined:
            return True
    return False
def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
    """Append one line to ``current_content`` with optional line breaks.

    Args:
        current_content (list): Fragments collected so far; mutated in place.
        line_content (str): The line to append.
        append_newline: Pending-break flag. When truthy, a ``'\\n'`` is
            inserted before the line if ``should_add_newline`` approves,
            and the flag is consumed.
        keywords: Passed through to ``should_add_newline``.
        in_special_section: When truthy, a ``'\\n'`` is also appended after
            the line.

    Returns:
        The updated flag: ``False`` once a pending break has been consumed,
        otherwise the value that was passed in.
    """
    break_before = False
    if append_newline:
        break_before = should_add_newline(current_content, keywords)
        append_newline = False  # the pending break is consumed either way

    pieces = []
    if break_before:
        pieces.append('\n')
    pieces.append(line_content)
    if in_special_section:
        pieces.append('\n')
    current_content.extend(pieces)
    return append_newline
import re
def parse_text_by_heading(text):
    """Split clause text into a ``{heading: content}`` dictionary.

    The text is processed line by line. A line may carry a leading ``"##"``
    marker, which is removed before matching; two of the heading forms below
    are only honoured on such marked lines.

    Recognised heading forms:
        * ``12.1 text`` / ``12. text``  -> key is the number as written.
        * ``.3 text`` on a marked line   -> key ``"<last main number>.3"``.
        * ``.text`` on a marked line     -> key is the next main number.
        * ``27text`` (no dot)            -> key ``"27."`` if it is 1..5
          larger than the current main number.
        * ``一、`` / ``(一)`` / ``A.`` headings -> key is the Chinese numeral.
          These are only recognised if one of them occurs in the first five
          lines, and only the style seen first is treated as a heading.

    Every other line is appended to the content of the heading being
    collected.

    Args:
        text (str): Newline-separated text to parse.

    Returns:
        dict: Maps heading keys (``"1.2"``, ``"3."``, ``"一"`` ...) to the
        text collected for each heading.
    """
    keywords = ['包含', '以下']
    # '\uff1a' is the fullwidth colon; both colon forms are accepted.
    special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括\uff1a',
                                '雷同认定', '包括以下内容']
    # Punctuation removed from the start of a heading's text
    # ('\uff0c' is the fullwidth comma).
    leading_punctuation = '.、,\uff0c'
    fullwidth_period = '\uff0e'
    # NOTE(review): the character compared against the end of the previous
    # fragment was lost in the rendered source; '第' (U+7B2C) is restored as
    # the presumed original - confirm against the repository.
    ordinal_prefix = '\u7b2c'

    # Lookup tables are built from one numeral string so that no key or value
    # can silently collapse to an empty string.
    numerals = '一二三四五六七八九十'
    chinese_to_number = {numeral: value for value, numeral in enumerate(numerals, start=1)}
    chinese_to_number.update({'十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15})
    letter_to_chinese = dict(zip('ABCDEFGHIJ', numerals))
    letter_to_chinese.update({'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'})

    # All patterns are compiled once, outside the per-line loop.
    # '\uff08' / '\uff09' are the fullwidth parentheses.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(
        r'^\s*[(\uff08]\s*([一二三四五六七八九十]{1,2})\s*[)\uff09]\s*')  # (一)
    pattern_letter = re.compile(r'^([A-Z])[.、]\s*(.*)$')  # A. text
    pattern_multi_level = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')  # 12.1 text
    pattern_single_level = re.compile(r'^(\d+\.)\s*(.+)$')  # 12. text
    pattern_dot_number = re.compile(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$')  # .12.1 text
    pattern_dot_text = re.compile(r'^[.、]\s*(\D.+)$')  # .text
    # A bare number must not be followed by a dot, digit, parenthesis or one
    # of 号/条/款 to count as a heading.
    pattern_pure_number = re.compile(r'^(\d+)([^.\d)(\uff08\uff09号条款].*)')  # 27text
    pattern_bare_number = re.compile(r'^(\d+)([^.\d].*)')  # looser form used by is_heading
    pattern_year = re.compile(r'^\d{4}\s*年')  # 2014年

    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    initial_heading_pattern = None
    in_special_section = False
    lines = text.split('\n')

    def normalize(raw_line):
        """Return (line without page marker, whether the marker was present)."""
        stripped = raw_line.strip().replace(fullwidth_period, '.')
        if stripped.startswith('##'):
            return stripped[2:], True
        return stripped, False

    def clean_title(title):
        """Drop leading punctuation and all spaces from a heading's text."""
        return title.lstrip(leading_punctuation).replace(' ', '')

    def get_current_number(key_chinese):
        """Map a Chinese numeral key to its value, or -1 if unknown/empty."""
        if not key_chinese:
            return -1
        return chinese_to_number.get(key_chinese, -1)

    def check_year_pattern(line):
        """Tell whether the line starts with a year such as '2014年'."""
        return pattern_year.match(line.replace(' ', '')) is not None

    def is_heading(line):
        """Tell whether the line looks like any Arabic-numbered heading."""
        return any(pattern.match(line) for pattern in
                   (pattern_multi_level, pattern_single_level,
                    pattern_dot_number, pattern_bare_number))

    def extract_number(key):
        """Return the main (first) number of a key such as '12.3' as float."""
        return float(key.rstrip('.').split('.')[0])

    def store(key, parts, joiner=''):
        """Add the joined parts to data[key], after any text already stored."""
        body = ''.join(parts).strip()
        existing = data.get(key, '')
        data[key] = existing + joiner + body if existing else body

    # Chinese-numeral / letter headings are only honoured when one of them
    # shows up within the first five lines.
    has_initial_heading_patterns = False
    for line in lines[:5]:
        candidate, _ = normalize(line)
        if (pattern_numbered.match(candidate)
                or pattern_parentheses.match(candidate)
                or pattern_letter.match(candidate)):
            has_initial_heading_patterns = True
            break

    for line in lines:
        line_stripped, in_double_hash_mode = normalize(line)

        # Inside a section, a line starting with a year is content, never a heading.
        if current_key is not None and check_year_pattern(line_stripped):
            current_content.append(line_stripped)
            continue

        match = (pattern_multi_level.match(line_stripped)
                 or pattern_single_level.match(line_stripped))

        # Any numbered heading enters or leaves the "special section" state,
        # depending on whether it mentions one of the special keywords.
        if is_heading(line_stripped):
            in_special_section = any(keyword in line_stripped
                                     for keyword in special_section_keywords)

        dot_match = pattern_dot_number.match(line_stripped)
        dot_text_match = pattern_dot_text.match(line_stripped)
        pure_number_match = pattern_pure_number.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = clean_title(line_content)
            # A heading that announces an enumeration suppresses the
            # Chinese-numeral sub-heading detection below.
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese

            # endswith() also copes with an empty previous fragment, where
            # indexing its last character would raise IndexError.
            previous_ends_with_ordinal = (bool(current_content)
                                          and current_content[-1].endswith(ordinal_prefix))
            if current_key is None or (compare_headings(current_key, new_key)
                                       and not previous_ends_with_ordinal):
                if current_key is not None:
                    store(current_key, current_content)
                current_key = new_key
                current_content = [line_content]
                # Only headings of at most two levels request a break after the title.
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_match:
            sub_number, line_content = dot_match.groups()
            if in_double_hash_mode and last_main_number:
                # '.3 text' on a marked line: re-attach the last main number.
                if current_key is not None:
                    store(current_key, current_content)
                current_key = f"{last_main_number}.{sub_number}"
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_text_match:
            if in_double_hash_mode:
                # '.text' on a marked line: synthesise the next main number.
                temp_content = dot_text_match.group(1).strip()
                if last_main_number:
                    try:
                        next_main_number = str(int(last_main_number) + 1) + '.'
                    except ValueError:
                        next_main_number = '未识别的序号.'
                else:
                    next_main_number = '1.'
                if current_key is not None:
                    store(current_key, current_content)
                current_key = next_main_number
                current_content = [temp_content]
                last_main_number = next_main_number.rstrip('.')
                append_newline = True
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        elif pure_number_match:
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'
            line_content = clean_title(line_content)

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)

            # Accept as a heading only if the number grows by at most 5.
            if current_key is None or 0 < new_key_num - current_key_num <= 5:
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                if current_key is not None:
                    store(current_key, current_content)
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        else:
            numbered_match = parentheses_match = letter_match = None
            current_heading_pattern = None
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                numbered_match = pattern_numbered.match(line_stripped)
                parentheses_match = pattern_parentheses.match(line_stripped)
                letter_match = pattern_letter.match(line_stripped)
                if numbered_match:
                    current_heading_pattern = 'numbered'
                elif parentheses_match:
                    current_heading_pattern = 'parentheses'
                elif letter_match:
                    current_heading_pattern = 'letter'
                # The first style encountered becomes the only accepted one.
                if current_heading_pattern is not None and initial_heading_pattern is None:
                    initial_heading_pattern = current_heading_pattern

            heading_accepted = False
            if (current_heading_pattern is not None
                    and current_heading_pattern == initial_heading_pattern):
                # Flush whatever was being collected before judging the new heading.
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                if current_key is not None:
                    store(current_key, current_content, joiner='\n')
                current_content = []

                if current_heading_pattern == 'numbered':
                    new_key_chinese = numbered_match.group(1)
                    new_value = clean_title(line_stripped[numbered_match.end():])
                elif current_heading_pattern == 'parentheses':
                    new_key_chinese = parentheses_match.group(1)
                    new_value = clean_title(line_stripped[parentheses_match.end():])
                else:
                    letter_key, letter_value = letter_match.groups()
                    new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                    new_value = clean_title(letter_value)

                # Only a strictly increasing numeral starts a new section.
                if (current_key_chinese is None
                        or get_current_number(new_key_chinese)
                        > get_current_number(current_key_chinese)):
                    current_key_chinese = new_key_chinese
                    current_value_chinese = new_value
                    current_key = None
                    heading_accepted = True

            # Anything not accepted as a heading is plain content.
            if not heading_accepted and line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

    # Flush the section that was still open at the end of the text.
    if current_key is not None:
        store(current_key, current_content)
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    return data
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract the cleaned text of a PDF, one ``##``-marked chunk per page.

    Each page's text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The first page is
    trimmed to begin at the first match of ``start_word``; the last page is
    trimmed to end before the last match of ``end_pattern``. If a pattern
    does not match, the corresponding page is left untrimmed.

    Args:
        file_path: Path of the PDF, as accepted by ``PdfReader``.
        start_word: Regular expression (searched with ``re.MULTILINE``)
            marking where the wanted text starts on the first page.
        end_pattern: Regular expression (searched with ``re.MULTILINE``)
            marking where the wanted text ends on the last page.

    Returns:
        str: The page texts joined by newlines, each page prefixed with
        ``"##"`` on its first line.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for page_index, page in enumerate(pdf_document.pages):
        # Extract once per page; extract_text() may yield nothing for a page.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        if page_index == 0:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                cleaned_text = cleaned_text[start_match.start():]

        if page_index == last_page_index:
            end_matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if end_matches:
                cleaned_text = cleaned_text[:end_matches[-1].start()]

        # Mark the first line of every page so the parser can spot page starts.
        all_pages_text.append('##' + cleaned_text)

    return "\n".join(all_pages_text)