zbparse/flask_app/general/投标人须知正文条款提取成json文件.py
2025-01-17 10:47:43 +08:00

494 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import re
import regex
from PyPDF2 import PdfReader
from flask_app.货物标.截取pdf货物标版 import clean_page_content, extract_common_header
def compare_headings(current, new):
    """Tell whether heading ``new`` comes after heading ``current``.

    Both headings are dotted numbers such as ``"1.2.3"`` or ``"4."``; empty
    and non-numeric components are ignored.

    Args:
        current (str): The heading currently being filled, e.g. ``"1.2.3"``.
        new (str): The candidate heading, e.g. ``"1.3"``.

    Returns:
        bool: ``True`` when ``new`` sorts after ``current`` (or extends it
        with extra levels) and its first number is at most 3 larger than the
        first number of ``current``; ``False`` otherwise.
    """

    def numeric_parts(heading):
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    current_parts = numeric_parts(current)
    new_parts = numeric_parts(new)

    # A jump of more than 3 in the top-level number is not accepted as the
    # next heading.
    if current_parts and new_parts and new_parts[0] - current_parts[0] > 3:
        return False

    # The first differing level decides the ordering.
    for current_value, new_value in zip(current_parts, new_parts):
        if new_value != current_value:
            return new_value > current_value

    # Identical common prefix: only a deeper heading counts as "after".
    return len(current_parts) < len(new_parts)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break belongs after the collected content.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Substrings that force a line break when present.
        max_length (int): Joined, stripped content of at most this many
            characters also gets a line break.

    Returns:
        bool: ``True`` if the joined content contains any keyword or is no
        longer than ``max_length``.
    """
    joined = ''.join(content).strip()
    for word in keywords:
        if word in joined:
            return True
    return len(joined) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append one line to the content list of the section being built.

    Args:
        current_content (list): Fragments of the current section; mutated
            in place.
        line_content (str): The line to append.
        append_newline (bool): Whether a pending line break should be
            considered before this line.
        keywords (list): Passed through to ``should_add_newline``.
        in_special_section (bool): When true, a line break is appended after
            the line as well.

    Returns:
        The updated ``append_newline`` flag: ``False`` once a pending break
        has been considered, otherwise the value passed in.
    """
    if append_newline:
        # The pending break is considered exactly once, whether or not it is
        # actually emitted.
        append_newline = False
        if should_add_newline(current_content, keywords):
            current_content.append('\n')
    pieces = [line_content, '\n'] if in_special_section else [line_content]
    current_content.extend(pieces)
    return append_newline
def parse_text_by_heading(text):
    """Split clause text into a ``{heading key: content}`` dictionary.

    The text is processed line by line. Full-width periods are normalised to
    ``.`` and a leading ``##`` is stripped; the two dot-prefixed heading forms
    below are only honoured on lines that carried that ``##`` marker.

    Recognised heading forms:

    * ``12.1 text`` / ``12. text`` -> key ``"12.1"`` / ``"12."``, accepted
      when there is no current key or ``compare_headings`` says it follows
      the current one.
    * ``.1 text`` -> key ``"<last main number>.1"``.
    * ``.text`` / ``、text`` -> key ``"<last main number + 1>."``.
    * ``27 text`` -> key ``"27."`` when there is no current key or 27 is
      greater than the current main number by at most 5.
    * ``一、`` / ``(一)`` / ``A.`` style headings -> Chinese-numeral key whose
      value is the heading text. These are only considered when one of the
      first five lines uses such a style, and only the style seen first is
      treated as a heading afterwards.

    Every other non-empty line is appended to the current section.

    Args:
        text (str): The clause text, lines separated by ``\\n``.

    Returns:
        dict: Heading keys mapped to the concatenated content of the section.
    """
    keywords = ['包含', '以下']
    # Headings containing one of these open a "special section": every line
    # appended while it is active is followed by a line break, and
    # Chinese-numeral headings are not looked for.
    # The original list held '文件包括:' twice; the second entry is restored
    # here as the full-width-colon variant (presumed lost in transcription).
    special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括\uff1a', '雷同认定',
                                '包括以下内容']

    # Numeral tables are built from one multi-character string so that no
    # single-character literal can be silently blanked again.
    chinese_units = '一二三四五六七八九十'
    chinese_to_number = {char: value for value, char in enumerate(chinese_units, start=1)}
    chinese_to_number.update({'十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15})
    letter_to_chinese = dict(zip('ABCDEFGHIJ', chinese_units))
    letter_to_chinese.update({'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'})

    fullwidth_period = '\uff0e'  # normalised to '.' before any matching
    # The character "第". A numbered line that follows content ending in it
    # is treated as a continuation rather than a heading.
    # NOTE(review): this literal was blank in the reviewed copy; "第" is the
    # presumed original - confirm against the repository.
    ordinal_prefix = '\u7b2c'

    # NOTE(review): some character classes below contain duplicated ASCII
    # punctuation (e.g. ``[..、]``); presumably one of each pair was a
    # full-width variant - confirm against the repository.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')  # (一)
    pattern_letter_initial = re.compile(r'^([A-Z])[..、]?\s*(.*)$')  # first five lines: "A text" or "A. text"
    pattern_letter_strict = re.compile(r'^([A-Z])[..、]\s*(.*)$')  # later lines: separator required
    pattern_decimal = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')  # '12.1 text'
    pattern_number_dot = re.compile(r'^(\d+\.)\s*(.+)$')  # '12. text'
    pattern_dot_number = re.compile(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$')  # '.12.1 text'
    pattern_dot_text = re.compile(r'^[.、]\s*(\D.+)$')  # '.text'
    pattern_pure_number = re.compile(r'^(\d+)([^.\d)()号条款].*)')  # '27text'
    pattern_bare_number = re.compile(r'^(\d+)([^.\d].*)')  # looser '27 text', used by is_heading only
    pattern_year = re.compile(r'^\d{4}\s*年')  # '2014年'
    heading_patterns = (pattern_decimal, pattern_number_dot, pattern_dot_number, pattern_bare_number)

    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    # Never assigned a non-None value in this function; the branches that
    # read it are kept unchanged.
    temp_title = None
    initial_heading_pattern = None
    in_special_section = False
    lines = text.split('\n')

    def get_current_number(key_chinese):
        # Ordinal of a Chinese-numeral key, or -1 when missing or unknown.
        if not key_chinese:
            return -1
        return chinese_to_number.get(key_chinese, -1)

    def check_year_pattern(line):
        # True for lines that start with a year such as '2014年'.
        return pattern_year.match(line.replace(' ', '')) is not None

    def is_heading(line):
        # True when the line matches any numeric heading form.
        return any(pattern.match(line) for pattern in heading_patterns)

    def extract_number(key):
        # Main (first) number of a key such as '12.' or '12.3'.
        return float(key.rstrip('.').split('.')[0])

    def normalise(raw_line):
        # Strip, normalise full-width periods and peel off a leading '##'.
        stripped = raw_line.strip().replace(fullwidth_period, '.')
        if stripped.startswith("##"):
            return stripped[2:], True
        return stripped, False

    def save_current_section():
        # Flush the collected content into ``data`` under the current key.
        if current_key is not None:
            content_string = ''.join(current_content).strip()
            data[current_key] = data.get(current_key, '') + content_string

    # Pre-scan the first five lines for a '一、' / '(一)' / 'A' style heading.
    has_initial_heading_patterns = False
    for line in lines[:5]:
        candidate, _ = normalise(line)
        if (pattern_numbered.match(candidate) or pattern_parentheses.match(candidate)
                or pattern_letter_initial.match(candidate)):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
        line_stripped, in_double_hash_mode = normalise(line)

        # Inside a section, a line starting with a year is plain content.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue

        match = pattern_decimal.match(line_stripped) or pattern_number_dot.match(line_stripped)

        # Any numeric heading enters or leaves the special section.
        if is_heading(line_stripped):
            in_special_section = any(keyword in line_stripped for keyword in special_section_keywords)

        dot_match = pattern_dot_number.match(line_stripped)
        dot_text_match = pattern_dot_text.match(line_stripped)
        pure_number_match = pattern_pure_number.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('..、,').replace(' ', '')
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            # Persist the pending Chinese-numeral heading.
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese

            if temp_title:
                main_number = new_key.split('.')[0]
                data[f"{main_number}."] = temp_title
                temp_title = None

            # ``endswith`` is safe on an empty fragment, where indexing the
            # last character would raise IndexError.
            starts_new_section = current_key is None or (
                compare_headings(current_key, new_key)
                and (not current_content or not current_content[-1].endswith(ordinal_prefix))
            )
            if starts_new_section:
                save_current_section()
                current_key = new_key
                current_content = [line_content]
                # Only first- and second-level headings get a pending break.
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

        elif dot_match:
            if in_double_hash_mode and last_main_number:
                # '.1 text': attach the sub-number to the last main number.
                sub_number, line_content = dot_match.groups()
                sub_number = sub_number.replace(fullwidth_period, '.')
                new_key = f"{last_main_number}.{sub_number}"
                save_current_section()
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

        elif dot_text_match:
            if in_double_hash_mode:
                # '.text': synthesise the next first-level number.
                temp_content = dot_text_match.group(1).strip()
                if last_main_number:
                    try:
                        next_main_number = str(int(last_main_number) + 1) + '.'
                    except ValueError:
                        next_main_number = '未识别的序号.'
                else:
                    # No previous main number: start from '1.'.
                    next_main_number = '1.'
                save_current_section()
                current_key = next_main_number
                current_content = [temp_content]
                last_main_number = next_main_number.rstrip('.')
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

        elif pure_number_match:
            # '27text': a bare number without a trailing dot.
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'
            line_content = line_content.lstrip('..、,').replace(' ', '')

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)
            # Accept as a heading only when it increases by at most 5.
            if current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5):
                if temp_title:
                    main_number = new_key_candidate.split('.')[0]
                    data[f"{main_number}."] = temp_title
                    temp_title = None
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                save_current_section()
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

        else:
            treat_as_content = True
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                numbered_match = pattern_numbered.match(line_stripped)  # 一、
                parentheses_match = pattern_parentheses.match(line_stripped)  # (一)
                pattern_letter = pattern_letter_initial if i < 5 else pattern_letter_strict
                letter_match = pattern_letter.match(line_stripped)  # A. text

                if numbered_match:
                    current_heading_pattern = 'numbered'
                elif parentheses_match:
                    current_heading_pattern = 'parentheses'
                elif letter_match:
                    current_heading_pattern = 'letter'
                else:
                    current_heading_pattern = None

                if current_heading_pattern is not None:
                    # The first style encountered becomes the only accepted one.
                    if initial_heading_pattern is None:
                        initial_heading_pattern = current_heading_pattern

                    if current_heading_pattern == initial_heading_pattern:
                        # Persist whatever was being collected so far.
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
                        if current_key is not None:
                            content_string = ''.join(current_content).strip()
                            # Separate from already-stored content with a line break.
                            if data.get(current_key):
                                data[current_key] = data.get(current_key) + '\n' + content_string
                            else:
                                data[current_key] = content_string
                        current_content = []

                        if current_heading_pattern == 'numbered':
                            new_key_chinese = numbered_match.group(1)
                            new_value = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
                            new_key_chinese = parentheses_match.group(1)
                            new_value = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
                        else:
                            letter_key, letter_value = letter_match.groups()
                            new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                            new_value = letter_value.lstrip('..、,').replace(' ', '')

                        # Only a strictly increasing ordinal opens a new chapter.
                        current_number = get_current_number(current_key_chinese)
                        new_number = get_current_number(new_key_chinese)
                        if new_number > current_number or current_key_chinese is None:
                            current_key_chinese = new_key_chinese
                            current_value_chinese = new_value
                            current_key = None
                            treat_as_content = False

            # Not accepted as a heading: keep the line as content.
            if treat_as_content and line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

    # Flush whatever is still pending.
    save_current_section()
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
        data["未分类标题"] = temp_title
    return data
def _cut_before_start_line(cleaned_text, start_pattern_1, start_pattern_2):
    # Drop everything before the first line matching a start pattern.
    bare_lines = cleaned_text.splitlines()
    # Character offset of each line's start, so the cut lands on the matched
    # line itself rather than on the first place its text happens to occur.
    offsets = []
    position = 0
    for kept_line in cleaned_text.splitlines(keepends=True):
        offsets.append(position)
        position += len(kept_line)

    for idx, line in enumerate(bare_lines):
        if regex.search(start_pattern_1, line):
            # Keep the matched line.
            return cleaned_text[offsets[idx]:]
        if start_pattern_2 and regex.search(start_pattern_2, line):
            # Keep only what follows the matched line.
            return cleaned_text[offsets[idx + 1]:] if idx + 1 < len(bare_lines) else ""
    return cleaned_text


def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2):
    """Extract the clause text of a PDF, trimmed to the start/end markers.

    Each page's text is cleaned with ``clean_page_content`` using the header
    returned by ``extract_common_header``. On the first page, text before the
    first line matching a start pattern is dropped: a ``start_pattern_1``
    match keeps the matched line, while a ``start_pattern_2`` match keeps
    only the lines after it (for title lines such as '三、招投标须知正文').
    On the last page, text from the last ``end_pattern`` match onwards is
    dropped. Every page's text is prefixed with ``##``.

    Args:
        file_path (str): Path of the PDF file.
        end_pattern (str): Regex marking where the text ends, searched with
            ``regex.MULTILINE`` on the last page.
        start_pattern_1 (str): Regex for a start line that is kept.
        start_pattern_2 (str or None): Regex for a start line that is
            skipped; may be ``None``.

    Returns:
        str: The page texts joined with ``\\n``.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for i, page in enumerate(pdf_document.pages):
        # Extract once per page; a falsy result becomes an empty string.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        if i == 0:
            cleaned_text = _cut_before_start_line(cleaned_text, start_pattern_1, start_pattern_2)

        if i == last_page_index:
            matches = list(regex.finditer(end_pattern, cleaned_text, regex.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]

        # Mark the first line of the page.
        all_pages_text.append('##' + cleaned_text)

    return "\n".join(all_pages_text)
def convert_clause_to_json(file_path, output_folder, type=1):
    """Parse a clause PDF into heading-keyed JSON and write it to disk.

    Args:
        file_path (str): Path of the PDF to parse.
        output_folder (str): Folder for the JSON file; created if missing.
        type (int): ``2`` selects the start/end patterns for the
            announcement / invitation section; any other value selects the
            patterns for the main clause text. The output file is
            ``clause1.json`` when ``type == 1`` and ``clause2.json``
            otherwise.

    Returns:
        str: Path of the written JSON file, or ``""`` when ``file_path``
        does not exist.

    Raises:
        ValueError: If ``file_path`` does not end with ``.pdf``
            (case-insensitive).
    """
    if not os.path.exists(file_path):
        # Name the missing path so the log line is actionable.
        print(f"The specified file does not exist: {file_path}, 返回空的clause_path")
        return ""

    if type == 2:
        start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$'
        start_pattern_2 = None
        end_pattern = (
            r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # first alternative
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # negative lookbehinds
            r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$'  # second alternative
        )
    else:
        start_pattern_1 = (
            r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|'  # optional parenthesised numeral followed by punctuation
            r'[一二12][、..]+|'  # numeral followed by punctuation
            r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # the keyword itself
        )
        start_pattern_2 = (
            r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # not preceded by a reference word or a quote
            r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # "... 须知正文" title line
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # "第X章/部分" title line
        )
        end_pattern = (
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[:]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[:]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[:]'
        )

    # Accept '.PDF' as well as '.pdf'.
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Unsupported file format")
    text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
    parsed_data = parse_text_by_heading(text)

    if not os.path.exists(output_folder):
        # exist_ok guards against the folder appearing between the check
        # and the creation.
        os.makedirs(output_folder, exist_ok=True)
        print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
if __name__ == "__main__":
    # Manual run against a local sample; results go to ``output_folder``.
    output_folder = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444'
    file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\招标文件111_tobidders_notice_part2.pdf'
    # Other sample inputs:
    # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as e:
        # Raised for inputs that are not PDF files.
        print("Error:", e)
    else:
        print(f"Final JSON result saved to: {output_path}")