# Source: zbparse/flask_app/general/投标人须知正文条款提取成json文件.py
# (513 lines, 28 KiB, Python)
#
# NOTE: The repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters
# (for example full-width punctuation that looks like its ASCII counterpart).
# If that is intentional the warning can be ignored; otherwise reveal the
# escaped characters in the viewer and check them.

import json
import os
import re
import regex
from PyPDF2 import PdfReader
from flask_app.general.截取pdf通用函数 import clean_page_content, extract_common_header
def compare_headings(current, new, max_gap=3):
    """Decide whether heading number ``new`` plausibly follows ``current``.

    Heading numbers are dotted strings such as ``"1.2.3"`` or ``"4."``.
    A new heading is accepted only if it sorts after the current one, and
    its leading (top-level) number is not more than ``max_gap`` larger than
    the current leading number.  The gap check rejects text that merely
    starts with a large number (e.g. ``"2025年xxx"``) and would otherwise be
    mistaken for a heading.

    Args:
        current (str): The current heading number, e.g. ``"1.2.3"``.
        new (str): The candidate heading number, e.g. ``"1.3"``.
        max_gap (int): Largest allowed jump of the leading number.
            Defaults to 3, the value previously hard-coded.

    Returns:
        bool: ``True`` if ``new`` is greater than ``current`` (or is a deeper
        sub-level of it) and within ``max_gap``; ``False`` otherwise.
    """
    # Keep only the purely numeric components; empty parts produced by a
    # trailing dot ("4.") are dropped.
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]

    # Reject jumps of the top-level number that are too large to be a heading.
    if current_nums and new_nums and new_nums[0] > current_nums[0] + max_gap:
        return False

    # The first differing component decides the order.
    for cur, nxt in zip(current_nums, new_nums):
        if nxt > cur:
            return True
        if nxt < cur:
            return False

    # All shared components are equal: only a deeper level counts as "after".
    return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
    """Tell whether a line break should follow the text collected so far.

    The pieces in ``content`` are concatenated and stripped.  A break is
    wanted when that text is at most ``max_length`` characters long or
    contains any of ``keywords``.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Iterable of substrings that force a line break.
        max_length (int): Length threshold, inclusive.

    Returns:
        bool: ``True`` if a newline should be added.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append one line to ``current_content`` and return the updated flag.

    Example: with ``current_content == ['费用承担']`` and a pending newline,
    appending ``'投标人准备和参加投标活动发生的费用自理。'`` yields
    ``['费用承担', '\\n', '投标人准备和参加投标活动发生的费用自理。']``.

    Args:
        current_content (list): Fragments of the clause being built; mutated
            in place.
        line_content (str): The line to append.
        append_newline: Truthy if a line break is pending before this line.
        keywords: Passed through to ``should_add_newline``.
        in_special_section: Truthy if every appended line must be followed
            by its own line break.

    Returns:
        The new value of the pending-newline flag: ``False`` once a pending
        request has been handled, otherwise the value passed in.
    """
    pending = append_newline
    if pending:
        # The request is one-shot: it is cleared whether or not a newline
        # was actually inserted.
        pending = False
        if should_add_newline(current_content, keywords):
            current_content.append('\n')

    current_content.append(line_content)
    if in_special_section:
        current_content.append('\n')
    return pending
def parse_text_by_heading(text):
    """Split clause text into a dict keyed by heading number.

    The text is processed line by line.  Lines that start with an Arabic
    heading number (``12.1 xxx``, ``12. xxx``, ``27xxx``) open a new entry
    keyed by that number; major headings (``一、xxx``, ``（一）xxx``,
    ``A. xxx``) open an entry keyed by the Chinese numeral; every other line
    is appended to the entry currently being built.

    A line prefixed with ``##`` is the first line of a PDF page (the marker
    is added by ``extract_text_from_pdf``).  On such lines a heading whose
    leading number was stripped during page cleaning (``.12.1 xxx`` or
    ``.xxx``) is repaired from the last known top-level number.

    Args:
        text (str): The extracted clause text, lines separated by ``\\n``.

    Returns:
        dict: Maps heading keys (e.g. ``"1.1"``, ``"3."``, ``"一"``) to the
        text collected under that heading.
    """
    keywords = ['包含', '以下']
    # Only "document composition" clauses are treated as special sections:
    # their items usually fit on one line each (e.g. "投标函"), so each line
    # gets its own line break.  "下列情形"-style clauses are deliberately not
    # included because breaking them would hurt readability.
    special_section_keywords = [r'文件.*(组成|构成|包括|包含)']

    data = {}
    current_key = None              # Arabic heading currently being filled
    current_key_chinese = None      # major (Chinese-numeral) heading
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None         # top-level number of the last heading
    initial_heading_pattern = None  # style of the first major heading seen
    in_special_section = False

    # All patterns are compiled once instead of once per line.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    # NOTE(review): the character classes here had a single ASCII member,
    # which looks like a lost full-width variant; full-width parentheses are
    # accepted as well.
    pattern_parentheses = re.compile(r'^\s*[(（]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')  # （一）
    pattern_letter_initial = re.compile(r'^([A-Z])[..、]?\s*(.*)$')  # "A xxx" or "A. xxx" (first lines only)
    pattern_letter_strict = re.compile(r'^([A-Z])[..、]\s*(.*)$')    # "A. xxx"
    pattern_multi_level = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')  # "12.1 xxx"
    pattern_single_level = re.compile(r'^(\d+\.)\s*(.+)$')                      # "12. xxx"
    pattern_dot_number = re.compile(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$')            # ".12.1 xxx"
    pattern_dot_text = re.compile(r'^[.、]\s*(\D.+)$')                          # ".xxx"
    pattern_bare_number_loose = re.compile(r'^(\d+)([^.\d].*)')                 # "27 xxx"
    # Bare number not followed by a unit/classifier character, e.g. "27xxx".
    pattern_bare_number = re.compile(
        r'^(\d+)([^.\d)()号条款节章项例页段部步点年月日时分秒个元千万台份家].*)')
    pattern_year = re.compile(r'^\d{4}\s*年')

    # Used to compare the order of Chinese-numeral headings.
    chinese_to_number = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
    }
    letter_to_chinese = {
        'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
        'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
        'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
    }

    lines = text.split('\n')

    def normalize(raw_line):
        # Map the full-width dot to an ASCII dot so "1．2" parses like "1.2".
        return raw_line.strip().replace('．', '.')

    def get_current_number(key_chinese):
        if not key_chinese:
            return -1
        return chinese_to_number.get(key_chinese, -1)

    def check_year_pattern(line):
        # True for lines such as "2014年...", which are text, not headings.
        return pattern_year.match(line.replace(' ', '')) is not None

    def is_heading(line):
        return any(p.match(line) for p in (
            pattern_multi_level, pattern_single_level,
            pattern_dot_number, pattern_bare_number_loose))

    def is_special_title(line):
        return any(re.search(p, line) for p in special_section_keywords)

    def extract_number(key):
        # Top-level number of a key such as "12." or "12.3".
        return float(key.rstrip('.').split('.')[0])

    def flush_current_key():
        # Store the collected content under the current Arabic heading.
        if current_key is not None:
            data[current_key] = data.get(current_key, '') + ''.join(current_content).strip()

    def append_as_content(line):
        nonlocal append_newline
        append_newline = handle_content_append(
            current_content, line, append_newline, keywords, in_special_section)

    def match_major_heading(line, index):
        # Returns (style, chinese_key, title) for a major heading, else None.
        m = pattern_numbered.match(line)
        if m:
            return 'numbered', m.group(1), line[m.end():].lstrip('..、,').replace(' ', '')
        m = pattern_parentheses.match(line)
        if m:
            return 'parentheses', m.group(1), line[m.end():].lstrip('..、,').replace(' ', '')
        # Within the first five lines the separator after the letter is
        # optional; afterwards it is required.
        letter_pattern = pattern_letter_initial if index < 5 else pattern_letter_strict
        m = letter_pattern.match(line)
        if m:
            letter, rest = m.groups()
            return ('letter', letter_to_chinese.get(letter, letter),
                    rest.lstrip('..、,').replace(' ', ''))
        return None

    # Pre-scan the first five lines: major headings are only recognised later
    # if one of them shows up here.
    has_initial_heading_patterns = False
    for line in lines[:5]:
        probe = normalize(line)
        if probe.startswith("##"):
            probe = probe[2:]
        if (pattern_numbered.match(probe) or pattern_parentheses.match(probe)
                or pattern_letter_initial.match(probe)):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
        line_stripped = normalize(line)
        # "##" marks the first line of a page.
        in_double_hash_mode = line_stripped.startswith("##")
        if in_double_hash_mode:
            line_stripped = line_stripped[2:]

        # Inside a clause, a line starting with a year is always content.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue

        match = (pattern_multi_level.match(line_stripped)
                 or pattern_single_level.match(line_stripped))

        # Entering a special section makes the following lines (even ones
        # that look like "一、招标公告") count as content of that clause; the
        # next ordinary numbered heading leaves it again.
        if is_heading(line_stripped):
            in_special_section = is_special_title(line_stripped)

        dot_match = pattern_dot_number.match(line_stripped)
        dot_text_match = pattern_dot_text.match(line_stripped)
        pure_number_match = pattern_bare_number.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('..、,').replace(' ', '')
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese

            last_char = current_content[-1][-1] if current_content and current_content[-1] else None
            # NOTE(review): the character compared against was lost from the
            # file (the literal was empty, making this check a no-op).  '第'
            # is assumed: text ending in "第" followed by "3.2 ..." on the
            # next line is a wrapped reference, not a heading.  Confirm.
            if current_key is None or (compare_headings(current_key, new_key)
                                       and last_char != '第'):
                flush_current_key()
                current_key = new_key
                current_content = [line_content]
                # Top-level / second-level headings and "composition" titles
                # get a line break before their body text.
                append_newline = (
                    len(new_key.rstrip('.').split('.')) <= 2 or
                    is_special_title(line_content)
                )
                last_main_number = new_key.split('.')[0]
            else:
                append_as_content(line_stripped)

        elif dot_match:
            # ".12.1 xxx": only repaired on the first line of a page.
            if in_double_hash_mode:
                sub_number, line_content = dot_match.groups()
                in_special_section = is_special_title(line_stripped)
                if last_main_number:
                    flush_current_key()
                    current_key = f"{last_main_number}.{sub_number}"
                    current_content = [line_content]
                    append_newline = True
                else:
                    append_as_content(line_stripped)
            else:
                append_as_content(line_stripped)

        elif dot_text_match:
            # ".xxx": the whole top-level number is missing; generate the
            # next one.  Only done on the first line of a page.
            if in_double_hash_mode:
                temp_content = dot_text_match.group(1).strip()
                in_special_section = is_special_title(line_stripped)
                if last_main_number:
                    try:
                        next_main_number = str(int(last_main_number) + 1) + '.'
                    except ValueError:
                        # last_main_number was not a plain Arabic number.
                        next_main_number = '未识别的序号.'
                else:
                    next_main_number = '1.'
                flush_current_key()
                current_key = next_main_number
                current_content = [temp_content]
                last_main_number = next_main_number.rstrip('.')
                append_newline = True
                continue
            else:
                append_as_content(line_stripped)

        elif pure_number_match:
            # "27xxx": a bare number without a dot.
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'
            line_content = line_content.lstrip('..、,').replace(' ', '')
            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)
            # Accept only an increasing number that is at most 5 ahead.
            if current_key is None or (new_key_num > current_key_num
                                       and new_key_num - current_key_num <= 5):
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                flush_current_key()
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                append_as_content(line_stripped)

        else:
            # Either a major heading ("一、总则") or ordinary body text.
            heading = None
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                heading = match_major_heading(line_stripped, i)

            if heading is None:
                if line_stripped:
                    append_as_content(line_stripped)
                continue

            current_heading_pattern, new_key_chinese, new_value = heading
            if initial_heading_pattern is None:
                initial_heading_pattern = current_heading_pattern

            if current_heading_pattern != initial_heading_pattern:
                # A different heading style than the first one is content.
                append_as_content(line_stripped)
                continue

            # Save whatever was being collected before this heading.
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
            if current_key is not None:
                content_string = ''.join(current_content).strip()
                if data.get(current_key):
                    data[current_key] = data.get(current_key) + '\n' + content_string
                else:
                    data[current_key] = content_string
            current_content = []

            if (current_key_chinese is None
                    or get_current_number(new_key_chinese) > get_current_number(current_key_chinese)):
                current_key_chinese = new_key_chinese
                current_value_chinese = new_value
                current_key = None
            else:
                # Not greater than the current major heading: plain content.
                append_as_content(line_stripped)

    # Store the entries that were still open at the end of the text.
    flush_current_key()
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    return data
def _trim_to_start_marker(page_text, start_pattern_1, start_pattern_2):
    """Drop everything on the first page that precedes the start marker.

    Lines are scanned in order.  If a line matches ``start_pattern_1`` the
    text is kept from that line on.  If it matches ``start_pattern_2`` the
    text is kept from the following line on (empty if there is none).  If
    no line matches, the text is returned unchanged.
    """
    lines = page_text.splitlines()
    raw_lines = page_text.splitlines(keepends=True)
    offset = 0  # character offset of the current line inside page_text
    for line, raw in zip(lines, raw_lines):
        if regex.search(start_pattern_1, line):
            return page_text[offset:]
        if start_pattern_2 and regex.search(start_pattern_2, line):
            return page_text[offset + len(raw):]
        offset += len(raw)
    return page_text


def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2):
    """Extract the clause text of a PDF, trimmed to the start/end markers.

    Every page is cleaned with ``clean_page_content``.  The first page is
    cut at the start marker, the last page is cut at the last match of
    ``end_pattern``, and the first line of every page is prefixed with
    ``##`` so that later parsing can recognise page starts.

    Args:
        file_path (str): Path of the PDF file.
        end_pattern (str): Pattern marking the end of the section; searched
            on the last page with ``regex.MULTILINE``.
        start_pattern_1 (str): Pattern of the first line to keep.
        start_pattern_2 (str | None): Pattern of a title line after which
            the text starts (the matching line itself is dropped, e.g.
            "三、招投标须知正文").  May be ``None`` to disable it.

    Returns:
        str: The text of all pages joined with ``\\n``.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for i, page in enumerate(pdf_document.pages):
        # Extract once; fall back to "" when the page yields no text.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        if i == 0:
            cleaned_text = _trim_to_start_marker(cleaned_text, start_pattern_1, start_pattern_2)

        if i == last_page_index:
            matches = list(regex.finditer(end_pattern, cleaned_text, regex.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]

        # Mark the first line of the page.
        all_pages_text.append('##' + cleaned_text)

    return "\n".join(all_pages_text)
def convert_clause_to_json(file_path, output_folder, type=1):
    """Extract the clauses of a PDF section and save them as a JSON file.

    Args:
        file_path (str): Path of the input PDF.
        output_folder (str): Folder that receives the JSON file; created if
            it does not exist.
        type (int): ``1`` (default) selects the start/end patterns for the
            "instructions to bidders" body and writes ``clause1.json``;
            ``2`` selects the announcement/invitation patterns.  Any value
            other than ``1`` writes ``clause2.json``.

    Returns:
        str: Path of the written JSON file, or ``""`` if the input file is
        missing, is not a PDF, or any error occurs (the error is printed).
    """
    try:
        if not os.path.exists(file_path):
            print(f"The specified file does not exist: {file_path}. 返回空的clause_path")
            return ""
        if type == 2:
            start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$'
            start_pattern_2 = None
            end_pattern = (
                r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # alternative 1: chapter/part title at line start
                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # negative lookbehinds: not a cross-reference
                r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$'  # alternative 2: chapter/part title at line end
            )
        else:
            start_pattern_1 = (
                r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|'  # leading "(一)" / "(1)" with optional separators
                r'[一二12][、..]+|'  # or leading "一、" / "1." style number
                r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # followed by one of these section titles
            )
            start_pattern_2 = (
                r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # not preceded by "见", "与" or a quote
                r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # "...须知正文" title line
                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # or a "第X章/部分" title
            )
            end_pattern = (
                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[:]|'
                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[:]|'
                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[:]'
            )
        # NOTE(review): single-member classes such as [(], [)] and [:] in the
        # patterns above look like they once also listed the full-width
        # characters; the patterns are kept as found - confirm upstream.

        # Accept ".PDF" as well as ".pdf".
        if file_path.lower().endswith('.pdf'):
            text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
        else:
            raise ValueError("Unsupported file format")
        parsed_data = parse_text_by_heading(text)

        # Create the output folder if needed; exist_ok avoids a failure when
        # another process creates it between the check and the call.
        if not os.path.exists(output_folder):
            os.makedirs(output_folder, exist_ok=True)
            print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
        file_name = "clause1.json" if type == 1 else "clause2.json"
        output_path = os.path.join(output_folder, file_name)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, indent=4, ensure_ascii=False)
        print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
        return output_path
    except Exception as e:
        # Deliberate best-effort: callers receive "" instead of an exception.
        print(f"Error in convert_clause_to_json: {e}")
        return ""
if __name__ == "__main__":
    # Manual test run against a local sample file.
    file_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test\泉州白濑水利枢纽工程高低压配电装置设计、制造及采购 (1)_tobidders_notice_part2.pdf'
    # Other sample inputs:
    # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test'
    # convert_clause_to_json reports its own errors and returns "" on
    # failure, so the result is checked instead of catching exceptions.
    output_path = convert_clause_to_json(file_path, output_folder)
    if output_path:
        print(f"Final JSON result saved to: {output_path}")
    else:
        print("Error: clause extraction failed; no JSON file was written.")