zbparse/flask_app/general/投标人须知正文条款提取成json文件.py

337 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from PyPDF2 import PdfReader
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
def compare_headings(current, new):
    """Return True when heading number ``new`` comes after ``current``.

    Both arguments are dotted heading numbers such as ``'12.1'`` or ``'3.'``.
    Only the all-digit segments are considered; empty or non-numeric segments
    are ignored. The numeric segments are compared left to right, and when
    one heading is a prefix of the other the longer one (a sub-section) is
    treated as the later heading. Equal headings yield False.
    """
    def numeric_parts(heading):
        # Drop empty pieces (trailing dot) and anything that is not digits.
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    # List comparison is lexicographic and ranks a strict prefix lower,
    # which is the ordering described above.
    return numeric_parts(new) > numeric_parts(current)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break belongs after the text collected so far.

    Args:
        content: iterable of string fragments; they are concatenated and
            stripped before being inspected.
        keywords: substrings whose presence in the text forces a line break.
        max_length: texts of at most this many characters also get a break.

    Returns:
        True if the joined text is short enough or contains any keyword.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    for keyword in keywords:
        if keyword in joined:
            return True
    return False
def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
    """Append one line to the section buffer, inserting line breaks as needed.

    Args:
        current_content: list of fragments collected for the current section;
            mutated in place.
        line_content: the line to append.
        append_newline: whether a line break is still pending before this line.
        keywords: passed through to ``should_add_newline``.
        in_special_section: when truthy, a line break is added after the line.

    Returns:
        The updated pending-break flag: False once a pending break has been
        considered, otherwise the value that was passed in.
    """
    break_pending = append_newline
    # A pending break is only materialised if the text gathered so far
    # qualifies; either way the pending state is consumed.
    if break_pending and should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    if in_special_section:
        current_content.append('\n')
    return False if break_pending else append_newline
import re
def parse_text_by_heading(text):
    """Split clause text into a ``{heading key: content}`` dictionary.

    The text is processed line by line. Recognised heading forms are:

    * dotted numbers (``12.1 text``, ``12. text``) -> key ``'12.1'`` / ``'12.'``
    * a leading dot plus number (``.2 text``) -> key ``'<last main number>.2'``
    * a bare number (``27 text``) -> key ``'27.'``; accepted only when it is
      larger than the current main number by at most 5
    * a leading dot plus text (``.title``) -> remembered and stored under the
      main number of the next numeric heading (key ``'<n>.'``)
    * alternative styles ``一、``, ``(一)``, ``A.``, ``1、``; these are only
      considered when one of them occurs in the first five lines, and only
      the style seen first in the body is honoured afterwards

    Lines that are not headings are appended to the current section through
    ``handle_content_append``.

    Args:
        text: the clause text, lines separated by ``'\\n'``.

    Returns:
        dict mapping heading keys to their text. A dot-title that never met
        a numeric heading is stored under ``'未分类标题'``.
    """
    keywords = ['包含', '以下']
    # Headings containing one of these switch on "special section" mode, in
    # which every following line is kept as content with its own line break.
    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:','雷同认定','包括以下内容']
    leading_punctuation = '..、,'  # stripped from the start of heading text

    # Written as escapes so that tooling which drops "ambiguous" Unicode
    # characters cannot silently turn them into empty strings.
    fullwidth_dot = '\uff0e'       # full-width full stop
    ideographic_comma = '\u3001'   # '、'
    # NOTE(review): the character originally compared against the end of the
    # previous content was missing from the source (it read ''); '第' is
    # assumed, i.e. a reference such as "第 3.2 条" wrapped across two lines.
    # Confirm against repository history.
    continuation_marker = '\u7b2c'

    # A..O -> 一..十五 (J is 十, K is 十一, ...).
    chinese_units = '一二三四五六七八九十'
    chinese_ordinals = list(chinese_units) + ['十' + unit for unit in chinese_units[:5]]
    letter_to_chinese = dict(zip('ABCDEFGHIJKLMNO', chinese_ordinals))

    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')  # (一)
    pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$')  # pre-scan only: "A text" or "A. text"
    pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$')  # main loop: strictly "A. text"
    pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')  # 1、text
    pattern_multi_level = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')  # 12.1 text
    pattern_single_level = re.compile(r'^(\d+\.)\s*(.+)$')  # 12. text
    pattern_dot_number = re.compile(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$')  # .12.1 text
    pattern_dot_text = re.compile(r'^[.]\s*(\D.+)$')  # .title
    pattern_pure_number = re.compile(r'^(\d+)([^.\d)(].*)')  # 27text, no bracket right after the number
    pattern_bare_number = re.compile(r'^(\d+)([^.\d].*)')  # 27 text (heading detection only)
    pattern_year = re.compile(r'^\d{4}\s*年')  # 2014年

    heading_detectors = (pattern_multi_level, pattern_single_level,
                         pattern_dot_number, pattern_bare_number)
    # Order matters: the first matching style wins.
    alternative_styles = (
        ('numbered', pattern_numbered),
        ('parentheses', pattern_parentheses),
        ('letter', pattern_letter),
        ('arabic', pattern_arabic),
    )

    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    temp_title = None  # dot-title waiting for the next numeric heading
    initial_heading_pattern = None
    in_special_section = False

    def normalize(raw_line):
        # Unify full-width dots so the ASCII-only patterns above apply.
        return raw_line.strip().replace(fullwidth_dot, '.')

    def check_year_pattern(line):
        # "2014年" style lines start with digits but are not headings.
        return pattern_year.match(line.replace(' ', '')) is not None

    def is_heading(line):
        return any(detector.match(line) for detector in heading_detectors)

    def extract_number(key):
        # Main (first) number of a key such as '12.' or '12.3'.
        return float(key.rstrip('.').split('.')[0])

    def store_section(key, parts):
        # Sections may be flushed more than once, hence the concatenation.
        data[key] = data.get(key, '') + ''.join(parts).strip()

    lines = text.split('\n')

    # The alternative heading styles are enabled only if one of them shows up
    # within the first five lines.
    has_initial_heading_patterns = False
    for line in lines[:5]:
        probe = normalize(line)
        if (pattern_numbered.match(probe) or
                pattern_parentheses.match(probe) or
                pattern_letter_initial.match(probe) or
                pattern_arabic.match(probe)):
            has_initial_heading_patterns = True
            break

    for line in lines:
        line_stripped = normalize(line)

        # Inside a section, a year line is plain content.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue

        match = pattern_multi_level.match(line_stripped) or pattern_single_level.match(line_stripped)

        # Entering / leaving a special section is decided on heading lines only.
        if is_heading(line_stripped):
            in_special_section = any(
                keyword in line_stripped for keyword in special_section_keywords)

        dot_match = pattern_dot_number.match(line_stripped)
        dot_text_match = pattern_dot_text.match(line_stripped)
        pure_number_match = pattern_pure_number.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip(leading_punctuation).replace(' ', '')
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            # A pending Chinese-numbered heading is complete once a numeric
            # heading follows it.
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None

            # Attach a pending dot-title to this heading's main number.
            if temp_title:
                main_number = new_key.split('.')[0]
                data[f"{main_number}."] = temp_title
                temp_title = None

            # endswith() instead of [-1][-1]: the previous fragment may be an
            # empty string (heading without text), which used to raise
            # IndexError.
            starts_new_section = current_key is None or (
                compare_headings(current_key, new_key)
                and not (current_content
                         and current_content[-1].endswith(continuation_marker)))
            if starts_new_section:
                if current_key is not None:
                    store_section(current_key, current_content)
                current_key = new_key
                current_content = [line_content]
                # Only first- and second-level headings request a line break
                # before their following text.
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_match:
            # The line is already dot-normalised, so sub_number is ASCII.
            sub_number, line_content = dot_match.groups()
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    store_section(current_key, current_content)
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_text_match:
            temp_title = dot_text_match.group(1).strip()
            continue

        elif pure_number_match:
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'
            line_content = line_content.lstrip(leading_punctuation).replace(' ', '')

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)
            # Accept as a heading only if the number increases by at most 5.
            if current_key is None or (
                    new_key_num > current_key_num and new_key_num - current_key_num <= 5):
                if temp_title:
                    main_number = new_key_candidate.split('.')[0]
                    data[f"{main_number}."] = temp_title
                    temp_title = None
                if current_key is not None:
                    store_section(current_key, current_content)
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section)

        else:
            style, style_match = None, None
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                for name, pattern in alternative_styles:
                    style_match = pattern.match(line_stripped)
                    if style_match:
                        style = name
                        break
                # The first alternative style encountered becomes the only
                # one honoured for the rest of the text.
                if style is not None and initial_heading_pattern is None:
                    initial_heading_pattern = style

            if style is None or style != initial_heading_pattern:
                # Not a heading (or a heading of a different style): content.
                if line_stripped:
                    append_newline = handle_content_append(
                        current_content, line_stripped, append_newline, keywords, in_special_section)
                continue

            # Flush what has been collected so far. The buffer is cleared so
            # that a later flush of the same key does not append the same
            # text again.
            if current_key is not None:
                store_section(current_key, current_content)
                current_content = []
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None

            if style in ('numbered', 'parentheses'):
                current_key_chinese = style_match.group(1)
                current_value_chinese = line_stripped[style_match.end():].lstrip(
                    leading_punctuation).replace(' ', '')
            elif style == 'letter':
                letter_key, letter_value = style_match.groups()
                current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                current_value_chinese = letter_value.lstrip(leading_punctuation).replace(' ', '')
            else:  # 'arabic'
                arabic_key, arabic_value = style_match.groups()
                # '1、' -> '1.' so the key has the same form as other keys.
                current_key = arabic_key.replace(ideographic_comma, '.')
                current_content = [arabic_value.replace(' ', '')]
                append_newline = True
                last_main_number = current_key.rstrip('.')

    # Flush whatever is still open at the end of the text.
    if current_key is not None:
        store_section(current_key, current_content)
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
        # A dot-title that never met a numeric heading.
        data["未分类标题"] = temp_title
    return data
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract the text of a PDF, trimmed at a start and an end marker.

    Every page is extracted with PyPDF2 and passed through
    ``clean_page_content`` together with the result of
    ``extract_common_header(file_path)`` (both imported from the project;
    presumably they remove the repeated page header -- confirm in that
    module).

    Args:
        file_path: path of the PDF file.
        start_word: regex (string or compiled) searched with ``re.MULTILINE``
            on the first page; text before the first match is dropped.
        end_pattern: regex (string or compiled) searched with
            ``re.MULTILINE`` on the last page; text from the start of the
            last match onward is dropped.

    Returns:
        The cleaned page texts joined with ``'\\n'``. If a marker is not
        found, the corresponding page is kept untrimmed.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for page_index, page in enumerate(pdf_document.pages):
        # Extract once per page; text extraction is the costly step and the
        # result may be None/empty for pages without text.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # First page: drop everything before the start marker.
        if page_index == 0:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                cleaned_text = cleaned_text[start_match.start():]

        # Last page: cut at the last occurrence of the end marker. For a
        # single-page file this runs on the already start-trimmed text.
        if page_index == last_page_index:
            end_matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if end_matches:
                cleaned_text = cleaned_text[:end_matches[-1].start()]

        all_pages_text.append(cleaned_text)

    return "\n".join(all_pages_text)