2025-01-14 17:10:38 +08:00
|
|
|
|
import json
|
|
|
|
|
import os
|
2024-09-28 17:38:53 +08:00
|
|
|
|
import re
|
2025-01-14 17:10:38 +08:00
|
|
|
|
import regex
|
2024-09-28 17:38:53 +08:00
|
|
|
|
from PyPDF2 import PdfReader
|
2024-10-09 13:50:28 +08:00
|
|
|
|
|
2025-01-17 10:47:43 +08:00
|
|
|
|
from flask_app.货物标.截取pdf货物标版 import clean_page_content, extract_common_header
|
|
|
|
|
|
|
|
|
|
|
2024-09-28 17:38:53 +08:00
|
|
|
|
def compare_headings(current, new, max_major_gap=3):
    """Tell whether heading number ``new`` follows heading number ``current``.

    Both arguments are dotted heading numbers such as ``"1.2.3"`` or ``"4."``.
    Only the purely decimal components are compared; empty or non-numeric
    components (for example the empty string after a trailing dot) are ignored.

    Args:
        current (str): Number of the heading currently being collected,
            e.g. ``"1.2.3"``.
        new (str): Number of the candidate heading, e.g. ``"1.3"``.
        max_major_gap (int): Largest allowed jump of the first (top-level)
            component.  A candidate whose first component exceeds the current
            one by more than this is rejected.  Defaults to 3.

    Returns:
        bool: ``True`` when ``new`` sorts after ``current`` (or extends it with
        additional levels) and the top-level jump is within ``max_major_gap``;
        ``False`` otherwise.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as superscript digits, which int() cannot convert.
    current_nums = [int(part) for part in current.split('.') if part.isdecimal()]
    new_nums = [int(part) for part in new.split('.') if part.isdecimal()]

    # Reject implausibly large jumps of the top-level number.
    if current_nums and new_nums and new_nums[0] > current_nums[0] + max_major_gap:
        return False

    # The first differing component decides the order.
    for cur, nxt in zip(current_nums, new_nums):
        if nxt != cur:
            return nxt > cur

    # Shared prefix is identical: a longer number is a new sub-section.
    return len(new_nums) > len(current_nums)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break belongs after the text collected so far.

    Args:
        content: Iterable of string fragments; they are concatenated and
            stripped before being inspected.
        keywords: Iterable of substrings; finding any of them in the text
            yields ``True``.
        max_length (int): Texts of at most this many characters also yield
            ``True``.

    Returns:
        bool: ``True`` if the joined text contains a keyword or is no longer
        than ``max_length``.
    """
    joined = ''.join(content).strip()
    for needle in keywords:
        if needle in joined:
            return True
    return len(joined) <= max_length
|
|
|
|
|
|
2025-01-17 10:47:43 +08:00
|
|
|
|
|
|
|
|
|
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append one line to the section being collected and return the new flag.

    Args:
        current_content (list): Fragments of the current section; modified in
            place.
        line_content (str): The line to append.
        append_newline: When truthy, a ``'\\n'`` is inserted before the line if
            ``should_add_newline`` approves of the text collected so far; the
            flag is consumed either way.
        keywords: Passed through to ``should_add_newline``.
        in_special_section: When truthy, a ``'\\n'`` is also appended after the
            line.

    Returns:
        ``False`` if ``append_newline`` was truthy, otherwise the value that
        was passed in.
    """
    pending = append_newline
    if pending:
        # The request is honoured at most once, whether or not a break is added.
        pending = False
        if should_add_newline(current_content, keywords):
            current_content.append('\n')

    pieces = [line_content, '\n'] if in_special_section else [line_content]
    current_content.extend(pieces)
    return pending
|
|
|
|
|
|
2025-01-17 10:47:43 +08:00
|
|
|
|
|
2024-09-28 17:38:53 +08:00
|
|
|
|
def parse_text_by_heading(text):
    """Split clause text into a dictionary keyed by heading number.

    The text is processed line by line.  A line opens a new section when it
    starts with one of the recognised numbering styles:

    * dotted numbers such as ``12.1 content`` or ``12. content``;
    * a bare number such as ``27content``, accepted only when it exceeds the
      current top-level number by at most 5;
    * on a line carrying the ``##`` marker (``extract_text_from_pdf`` in this
      file puts it on the first line of every page): ``.1 content``, which is
      attached to the last top-level number, or ``.text``, which is given the
      next top-level number;
    * top-level headings written as ``一、``, ``(一)`` or ``A.``; these are
      only looked for when one of the styles occurs in the first five lines.
      The first style encountered becomes the only one recognised afterwards.

    Any other line is appended to the section currently being collected.

    Args:
        text (str): The clause text, lines separated by newline characters.

    Returns:
        dict: Numeric keys (``"1."``, ``"1.1"`` ...) map to the collected
        section text; Chinese-numeral keys (``"一"`` ...) map to the heading
        title.  Keys appear in the order in which they were first stored.
    """
    keywords = ['包含', '以下']
    # A heading containing one of these opens a "special" section in which a
    # line break is appended after every collected line and Chinese/letter
    # headings are not recognised.  The colon is listed in its ASCII and
    # full-width (U+FF1A) form.
    special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括\uff1a', '雷同认定',
                                '包括以下内容']
    # Punctuation removed from the front of a heading title.
    strip_chars = '.、,'

    chinese_to_number = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
    }
    letter_to_chinese = {
        'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
        'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
        'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
    }

    # All patterns are compiled once.  Lines are normalised so that the
    # full-width full stop (U+FF0E) is already an ASCII '.', hence the patterns
    # only need to mention '.'.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(
        r'^\s*[(\uff08]\s*([一二三四五六七八九十]{1,2})\s*[)\uff09]\s*')  # (一), ASCII or full-width brackets
    pattern_letter_initial = re.compile(r'^([A-Z])[.、]?\s*(.*)$')  # first five lines: 'Acontent' or 'A. content'
    pattern_letter_strict = re.compile(r'^([A-Z])[.、]\s*(.*)$')  # later lines: separator is mandatory
    multi_level_re = re.compile(r'^(\d+(?:\.\d+)+)\s*(.*)')  # '12.1 content'
    single_level_re = re.compile(r'^(\d+\.)\s*(.+)$')  # '12. content'
    dot_number_re = re.compile(r'^\.(\d+(?:\.\d+)*)\s*(.+)$')  # '.12.1 content'
    dot_text_re = re.compile(r'^[.、]\s*(\D.+)$')  # '.text'
    # '27content'; the character after the number must not be a bracket or one
    # of 号/条/款 (which indicate a reference rather than a heading).
    pure_number_re = re.compile(r'^(\d+)([^.\d)\uff09(号条款].*)')
    loose_number_re = re.compile(r'^(\d+)([^.\d].*)')  # '27 content', used by is_heading only
    year_re = re.compile(r'^\d{4}\s*年')  # '2014年'
    heading_res = (multi_level_re, single_level_re, dot_number_re, loose_number_re)

    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    initial_heading_pattern = None
    in_special_section = False
    lines = text.split('\n')

    def normalize_line(raw_line):
        """Return ``(line, has_marker)``: stripped, '.' unified, '##' removed."""
        cleaned = raw_line.strip().replace('\uff0e', '.')
        if cleaned.startswith("##"):
            return cleaned[2:], True
        return cleaned, False

    def get_current_number(key_chinese):
        """Map a Chinese numeral (一 .. 十五) to an int; -1 if unknown/empty."""
        if not key_chinese:
            return -1
        return chinese_to_number.get(key_chinese, -1)

    def is_heading(line):
        """True when the line starts with any of the numeric heading styles."""
        return any(pattern.match(line) for pattern in heading_res)

    def extract_number(key):
        """Top-level number of a numeric key such as '12.' as a float."""
        return float(key.rstrip('.').split('.')[0])

    def save_section(key, parts, joiner=''):
        """Store the joined fragments under ``key``, appending to existing text."""
        if key is None:
            return
        content_string = ''.join(parts).strip()
        existing = data.get(key)
        data[key] = (existing + joiner + content_string) if existing else content_string

    # Pre-scan: do the first five lines contain a '一、' / '(一)' / 'A' heading?
    has_initial_heading_patterns = False
    for line in lines[:5]:
        probe, _ = normalize_line(line)
        if (pattern_numbered.match(probe) or pattern_parentheses.match(probe)
                or pattern_letter_initial.match(probe)):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
        line_stripped, in_double_hash_mode = normalize_line(line)

        # Inside a section, a line starting with a year ('2014年') is content,
        # never a heading.
        if current_key is not None and year_re.match(line_stripped.replace(' ', '')):
            current_content.append(line_stripped)
            continue

        match = multi_level_re.match(line_stripped) or single_level_re.match(line_stripped)

        # Any numeric heading either enters or leaves the special section.
        if is_heading(line_stripped):
            in_special_section = any(keyword in line_stripped for keyword in special_section_keywords)

        dot_match = dot_number_re.match(line_stripped)
        dot_text_match = dot_text_re.match(line_stripped)
        pure_number_match = pure_number_re.match(line_stripped)

        # Set when the line turns out to be ordinary content of the current section.
        as_content = False

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip(strip_chars).replace(' ', '')

            skip_subheadings = any(keyword in line_content for keyword in keywords)

            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese

            # A number following a fragment that ends in '第' is a reference
            # broken across lines, not a heading.  endswith() is used so that
            # an empty previous fragment (a heading line without any title)
            # does not raise IndexError.
            starts_new = current_key is None or (
                compare_headings(current_key, new_key)
                and not (current_content and current_content[-1].endswith('第')))
            if starts_new:
                save_section(current_key, current_content)
                current_key = new_key
                current_content = [line_content]
                # Only headings of at most two levels request a line break
                # before the next appended line.
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                as_content = True

        elif dot_match:
            if in_double_hash_mode and last_main_number:
                # '.1 content' on a marked line: prepend the last top-level number.
                sub_number, line_content = dot_match.groups()
                save_section(current_key, current_content)
                current_key = f"{last_main_number}.{sub_number}"
                current_content = [line_content]
                append_newline = True
            else:
                as_content = True

        elif dot_text_match:
            if in_double_hash_mode:
                # '.text' on a marked line: assign the next top-level number.
                temp_content = dot_text_match.group(1).strip()
                if last_main_number:
                    try:
                        next_main_number = str(int(last_main_number) + 1) + '.'
                    except ValueError:
                        next_main_number = '未识别的序号.'
                else:
                    next_main_number = '1.'
                save_section(current_key, current_content)
                current_key = next_main_number
                current_content = [temp_content]
                last_main_number = next_main_number.rstrip('.')
                append_newline = True
            else:
                as_content = True

        elif pure_number_match:
            number_text, line_content = pure_number_match.groups()
            new_key_candidate = number_text + '.'
            line_content = line_content.lstrip(strip_chars).replace(' ', '')

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)

            # Accept only an increasing number that is at most 5 ahead.
            if current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5):
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                save_section(current_key, current_content)
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = number_text
            else:
                as_content = True

        elif has_initial_heading_patterns and not skip_subheadings and not in_special_section:
            numbered_match = pattern_numbered.match(line_stripped)
            parentheses_match = pattern_parentheses.match(line_stripped)
            pattern_letter = pattern_letter_initial if i < 5 else pattern_letter_strict
            letter_match = pattern_letter.match(line_stripped)

            if numbered_match:
                current_heading_pattern = 'numbered'
            elif parentheses_match:
                current_heading_pattern = 'parentheses'
            elif letter_match:
                current_heading_pattern = 'letter'
            else:
                current_heading_pattern = None

            if current_heading_pattern is None:
                as_content = True
            else:
                # The first style seen is the only one treated as a heading.
                if initial_heading_pattern is None:
                    initial_heading_pattern = current_heading_pattern

                if current_heading_pattern != initial_heading_pattern:
                    as_content = True
                else:
                    if current_key_chinese is not None:
                        data[current_key_chinese] = current_value_chinese
                    if current_key is not None:
                        # Text already stored under this key is continued on a new line.
                        save_section(current_key, current_content, joiner='\n')
                        current_content = []

                    if current_heading_pattern == 'numbered':
                        new_key_chinese = numbered_match.group(1)
                        new_value = line_stripped[numbered_match.end():].lstrip(strip_chars).replace(' ', '')
                    elif current_heading_pattern == 'parentheses':
                        new_key_chinese = parentheses_match.group(1)
                        new_value = line_stripped[parentheses_match.end():].lstrip(strip_chars).replace(' ', '')
                    else:
                        letter_key, letter_value = letter_match.groups()
                        new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                        new_value = letter_value.lstrip(strip_chars).replace(' ', '')

                    current_number = get_current_number(current_key_chinese)
                    new_number = get_current_number(new_key_chinese)
                    if new_number > current_number or current_key_chinese is None:
                        current_key_chinese = new_key_chinese
                        current_value_chinese = new_value
                        # Numeric sections restart below the new top-level heading.
                        current_key = None
                    else:
                        # Not increasing: the line is ordinary content.
                        as_content = True
        else:
            as_content = True

        if as_content and line_stripped:
            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                   in_special_section)

    # Store whatever is still being collected.
    save_section(current_key, current_content)
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese

    return data
|
|
|
|
|
|
2024-11-13 17:24:17 +08:00
|
|
|
|
|
2025-01-16 17:22:49 +08:00
|
|
|
|
def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2):
    """Extract the text of a PDF, trimmed to the clause body.

    Every page is cleaned with ``clean_page_content`` using the header found
    by ``extract_common_header``.  On the first page the text before the start
    marker is dropped; on the last page the text from the last ``end_pattern``
    match onwards is dropped.  Each page's text is prefixed with ``##`` and the
    pages are joined with newlines.

    Args:
        file_path (str): Path of the PDF file.
        end_pattern (str): ``regex`` pattern (searched with ``MULTILINE``)
            marking where the wanted text ends on the last page.
        start_pattern_1 (str): ``regex`` pattern searched in each line of the
            first page; the text is kept starting *with* the matching line.
        start_pattern_2 (str | None): Optional second pattern; on a match the
            text is kept starting with the line *after* the matching line
            (for a bare title line such as '三、招投标须知正文').

    Returns:
        str: The concatenated page texts.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for i, page in enumerate(pdf_document.pages):
        # Extract once; text extraction is repeated work if called twice.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # Look for the start marker on the first page.
        if i == 0:
            plain_lines = cleaned_text.splitlines()
            # Same lines with their terminators, so the kept part can be
            # rebuilt exactly.  Slicing by line index (rather than searching
            # for the line's text) is immune to the text also occurring
            # earlier, and to a blank following line.
            raw_lines = cleaned_text.splitlines(keepends=True)
            for idx, line in enumerate(plain_lines):
                if regex.search(start_pattern_1, line):
                    cleaned_text = ''.join(raw_lines[idx:])
                    break
                if start_pattern_2 and regex.search(start_pattern_2, line):
                    cleaned_text = ''.join(raw_lines[idx + 1:])
                    break

        # Look for the end marker on the last page; the last match wins.
        if i == last_page_index:
            matches = list(regex.finditer(end_pattern, cleaned_text, regex.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]

        # Mark the first line of every page.
        all_pages_text.append('##' + cleaned_text)

    return "\n".join(all_pages_text)
|
|
|
|
|
|
2025-01-17 10:47:43 +08:00
|
|
|
|
|
|
|
|
|
def convert_clause_to_json(file_path, output_folder, type=1):
    """Parse the clauses of a PDF and write them to a JSON file.

    The PDF text is extracted with ``extract_text_from_pdf`` using start/end
    patterns chosen by ``type``, split with ``parse_text_by_heading`` and
    dumped as UTF-8 JSON into ``output_folder``.

    Args:
        file_path (str): Path of the input file; must end in ``.pdf``
            (case-insensitive).
        output_folder (str): Directory for the result; created if missing.
        type (int): ``2`` selects the patterns for an announcement/invitation
            section (start at a title such as '招标公告'); any other value
            selects the patterns for the instructions-to-bidders body.  The
            output file is ``clause1.json`` when ``type == 1`` and
            ``clause2.json`` otherwise.  (The name shadows the builtin but is
            part of the public signature.)

    Returns:
        str: Path of the written JSON file, or ``""`` when ``file_path`` does
        not exist.

    Raises:
        ValueError: If ``file_path`` exists but is not a ``.pdf`` file.
    """
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}, 返回空的clause_path")
        return ""

    # In the character classes below, brackets, colons and full stops are
    # listed in both their ASCII and full-width form; the full-width characters
    # are written as \u escapes (U+FF08 '(', U+FF09 ')', U+FF1A ':', U+FF0E '.').
    if type == 2:
        start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)\uff09]?\s*$'
        start_pattern_2 = None
        end_pattern = (
            r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # chapter title at line start
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # not preceded by a referring word or quote
            r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()\uff08\uff09]+\s*$'  # chapter title at line end
        )
    else:
        start_pattern_1 = (
            r'^\s*(?:[(\uff08]\s*[一二12]?\s*[)\uff09]\s*[、.\uff0e]*|'  # '(一)' style prefix
            r'[一二12][、.\uff0e]+|'  # '一、' / '1.' style prefix
            r'[、.\uff0e]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # followed by 说明 / 总则 / 名词解释
        )
        start_pattern_2 = (
            r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # not preceded by 见 / 与 / a quote
            r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # '...须知正文' title line
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # '第X章/部分 ...'
        )
        end_pattern = (
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()\uff08\uff09]+\s*$|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[:\uff1a]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[:\uff1a]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[:\uff1a]'
        )

    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Unsupported file format")

    text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
    parsed_data = parse_text_by_heading(text)

    # Create the output folder if needed; exist_ok guards against another
    # process creating it between the check and the call.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
|
2025-01-14 17:10:38 +08:00
|
|
|
|
|
2025-01-17 10:47:43 +08:00
|
|
|
|
|
2025-01-14 17:10:38 +08:00
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample; adjust the paths to your machine.
    output_folder = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp'
    file_path = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\ztbfile_tobidders_notice_part2.pdf'
    # Other sample inputs used during development:
    # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as err:
        # Raised by convert_clause_to_json for a non-PDF input.
        print("Error:", err)
|