# zbparse/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
# NOTE(review): the lines below were web-viewer metadata fused into the file
# during extraction ("503 lines / 24 KiB / Python / Raw Normal View History",
# last commit 2024-09-28 17:38:53 +08:00); kept here only as a comment.
import json
import docx
import re
import os
2024-10-09 13:50:28 +08:00
import fitz
2024-09-28 17:38:53 +08:00
from PyPDF2 import PdfReader
2024-10-23 20:33:41 +08:00
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
2024-09-28 17:38:53 +08:00
def extract_text_from_docx(file_path):
    """Return the full plain text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
2024-10-09 13:50:28 +08:00
#PYPDF2版本
2024-09-28 17:38:53 +08:00
# PyPDF2-based extraction (a fitz-based variant exists below, commented out).
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract the text of a PDF between a start marker and an end marker.

    Pages are cleaned with clean_page_content() against the document's common
    header. On the first page the text before the first match of `start_word`
    is dropped; on the last page the text from the LAST match of `end_pattern`
    onward is dropped. Both patterns are applied with re.MULTILINE.

    Fixes vs. previous version:
    - page.extract_text() was called twice per page (once for the truthiness
      check, once for the value); now called once.
    - an `exclusion_pattern` regex was compiled but never used; removed.

    :param file_path: path to the PDF file
    :param start_word: regex marking where extraction begins (page 1)
    :param end_pattern: regex marking where extraction ends (last page)
    :return: the joined text of all pages, '\n'-separated
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    all_pages_text = []
    start_index = None
    last_page = len(pdf_document.pages) - 1
    for i, page in enumerate(pdf_document.pages):
        # extract_text() may return None for image-only pages; normalize to ""
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)
        # Locate the start marker on the first page only
        if i == 0 and start_index is None:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                start_index = start_match.start()
                cleaned_text = cleaned_text[start_index:]
        # Locate the end marker on the last page only; keep text before the
        # LAST occurrence so intermediate false matches are ignored
        if i == last_page:
            matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]
        all_pages_text.append(cleaned_text)
    return "\n".join(all_pages_text)
2024-10-09 13:50:28 +08:00
#fitz库版本
# def extract_text_from_pdf(file_path, start_word, end_pattern):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# doc = fitz.open(file_path)
# all_pages_text = []
# start_index = None
#
# # 处理所有页面
# for i in range(len(doc)):
# page = doc[i]
# page_text = page.get_text()
# cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# print("yes")
# # 在第一页查找开始位置
# if i == 0 and start_index is None:
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
# if start_match:
# start_index = start_match.start()
# cleaned_text = cleaned_text[start_index:]
#
# # 在最后一页查找结束位置
# if i == len(doc) - 1:
# for pattern in end_pattern:
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
# if matches:
# end_index = matches[-1].start()
# cleaned_text = cleaned_text[:end_index]
# break
#
# all_pages_text.append(cleaned_text)
#
# # 合并所有页面的文本
# full_text = "\n".join(all_pages_text)
# # 关闭文档
# doc.close()
#
# return full_text
2024-09-28 17:38:53 +08:00
def compare_headings(current, new):
    """Return True when heading `new` follows heading `current` in order.

    Headings are dotted number strings such as '3.2'; non-numeric segments
    are ignored. Segments are compared left to right; on a full tie, a
    deeper `new` heading counts as a new sub-section.
    """
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]
    for old_part, new_part in zip(current_nums, new_nums):
        if new_part != old_part:
            # First differing level decides the ordering
            return new_part > old_part
    # All shared levels equal: the deeper heading is a new sub-section
    return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
    """Return True when the gathered content warrants a separating newline.

    True when the joined, stripped content is short (<= max_length chars)
    or mentions any of `keywords`.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    return any(keyword in joined for keyword in keywords)
2024-11-06 17:29:45 +08:00
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append one line to the accumulated content of the current heading.

    When `append_newline` is pending, at most one '\n' is inserted before the
    line — only if the gathered text so far is short or mentions a keyword —
    and the pending flag is consumed. In a special section every appended
    line is followed by its own '\n'.

    :return: the (possibly cleared) append_newline flag
    """
    if append_newline:
        # Inlined should_add_newline(): keep a blank line when the gathered
        # text is still short (<= 20 chars) or mentions one of the keywords.
        gathered = ''.join(current_content).strip()
        if len(gathered) <= 20 or any(keyword in gathered for keyword in keywords):
            current_content.append('\n')
        append_newline = False
    current_content.append(line_content)
    if in_special_section:
        # Special sections keep one line per entry
        current_content.append('\n')
    return append_newline
"""
保存换行符的具体逻辑
对于二级标题 1.1如果其后的内容包含关键词或内容较短<=20字符会在内容前添加一个换行符
这个换行符会被保留在 current_content 列表中
当处理下一个标题时之前的内容包括可能存在的换行符会被合并并保存到 data 字典中
解决了''''这类标题出现在正文中的情况但是目前的逻辑是如果''已有了就加入正文否则''作为新的标题
2024-09-28 17:38:53 +08:00
"""
# Main JSON-extraction routine: turns clause text into {heading: body} pairs.
# NOTE(review): this block was recovered from a mangled paste — indentation
# is lost, commit-date lines are fused into the code, and several CJK
# characters inside string literals were stripped (e.g. replace('', '.'),
# the empty letter_to_chinese values, current_content[-1][-1] != '').
# The original bytes must be restored from version control before this runs;
# only comments have been touched here.
def parse_text_by_heading(text):
keywords = ['包含', '以下']
data = {}
current_key = None
current_key_chinese = None
current_value_chinese = None
2024-09-28 17:38:53 +08:00
current_content = []
append_newline = False
2024-10-09 13:50:28 +08:00
skip_subheadings = False
last_main_number = None
2024-11-04 17:13:06 +08:00
temp_title = None  # temporarily holds a dot-prefixed title that carries no number
2024-11-06 17:29:45 +08:00
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
initial_heading_pattern = None
special_section_keywords = ['文件的组成', '文件的构成']  # keywords that mark a special section
in_special_section = False  # whether we are currently inside a special section
2024-10-09 13:50:28 +08:00
lines = text.split('\n')
2024-11-04 17:13:06 +08:00
def check_year_pattern(line):
# Detect a year-like line such as '2014年'
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}\s*年', line_no_spaces) is not None
2024-10-09 13:50:28 +08:00
for i, line in enumerate(lines):
line_stripped = line.strip().replace('', '.')
2024-11-04 17:13:06 +08:00
# Inside a section, a year-pattern line is treated as plain content
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
2024-11-06 17:29:45 +08:00
# First, check whether this line enters a special section
if any(keyword in line_stripped for keyword in special_section_keywords):
in_special_section = True
2024-11-04 17:13:06 +08:00
# Match a numbered heading, e.g. '12.1 content'
2024-09-28 17:38:53 +08:00
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
2024-11-06 17:29:45 +08:00
# Check whether we should leave the special section
if in_special_section:
# A main heading without a special keyword ends the special section
if match and not any(keyword in line_stripped for keyword in special_section_keywords):
in_special_section = False
# Original matching logic follows
2024-11-04 17:13:06 +08:00
# Match a dot-prefixed numbered heading, e.g. '.12.1 content'
dot_match = re.match(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$', line_stripped)
# Match a dot-prefixed heading with no number, e.g. '.投标文件的制作和签署'
dot_text_match = re.match(r'^[.]\s*(\D.+)$', line_stripped)
2024-11-04 17:13:06 +08:00
# Match a bare digit-prefixed line with no dot, e.g. '27xxxxx'
2024-10-22 21:02:54 +08:00
pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
2024-09-28 17:38:53 +08:00
if match:
new_key, line_content = match.groups()
line_content = line_content.lstrip('..、,')
2024-10-09 13:50:28 +08:00
if any(keyword in line_content for keyword in keywords):
skip_subheadings = True
else:
skip_subheadings = False
# Flush the pending Chinese-numbered heading's content
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
2024-11-04 17:13:06 +08:00
# Associate a pending temporary title with this new main heading
if temp_title:
# Main number = digits before the first dot
main_number = new_key.split('.')[0]
# Attach the temporary title to that main number
data[f"{main_number}."] = temp_title
temp_title = None  # reset the temporary title
# Save the Arabic-numbered heading's content
2024-09-28 17:38:53 +08:00
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')):
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
2024-09-28 17:38:53 +08:00
current_key = new_key
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2
last_main_number = new_key.split('.')[0]
else:
2024-11-06 17:29:45 +08:00
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
2024-11-04 17:13:06 +08:00
elif dot_match:
2024-11-04 17:13:06 +08:00
# Dot-prefixed heading that carries a number
sub_number, line_content = dot_match.groups()
2024-11-04 17:13:06 +08:00
sub_number = sub_number.replace('', '.')
if last_main_number:
new_key = f"{last_main_number}.{sub_number}"
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
current_key = new_key
current_content = [line_content]
append_newline = True
2024-09-28 17:38:53 +08:00
else:
2024-11-06 17:29:45 +08:00
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
2024-11-04 17:13:06 +08:00
elif dot_text_match:
# Dot-prefixed heading without a number: stash it in a temporary variable
temp_title = dot_text_match.group(1).strip()
continue  # skip further handling of this line
2024-11-06 17:29:45 +08:00
elif pure_number_match:
# Bare digit-prefixed line with no dot
2024-10-22 21:02:54 +08:00
new_key_candidate, line_content = pure_number_match.groups()
new_key_candidate += '.'  # append the dot
line_content = line_content.lstrip('..、,')
# Numeric value of the current key and the candidate key
def extract_number(key):
return float(key.rstrip('.').split('.')[0])
current_key_num = extract_number(current_key) if current_key else 0
new_key_num = extract_number(new_key_candidate)
# New heading only if the number increases, by at most 5
if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
2024-11-04 17:13:06 +08:00
# Associate a pending temporary title with this new main heading
if temp_title:
main_number = new_key_candidate.split('.')[0]
data[f"{main_number}."] = temp_title
temp_title = None  # reset the temporary title
2024-10-22 21:02:54 +08:00
# Start a new heading
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
current_key = new_key_candidate
current_content = [line_content]
append_newline = True
last_main_number = new_key_candidate.rstrip('.')
else:
# Treat this line as content of the current heading
2024-11-06 17:29:45 +08:00
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
2024-10-22 21:02:54 +08:00
2024-09-28 17:38:53 +08:00
else:
2024-11-06 17:29:45 +08:00
if not skip_subheadings and not in_special_section:  # also honours in_special_section
numbered_match = pattern_numbered.match(line_stripped)
parenthesis_match = pattern_parentheses.match(line_stripped)
2024-10-09 13:50:28 +08:00
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
2024-11-06 17:29:45 +08:00
# Does this line match any heading pattern?
if numbered_match or parenthesis_match or letter_match or arabic_match:
# Record the first heading pattern seen, if not set yet
if initial_heading_pattern is None:
if numbered_match:
initial_heading_pattern = 'numbered'
elif parenthesis_match:
initial_heading_pattern = 'parentheses'
elif letter_match:
initial_heading_pattern = 'letter'
elif arabic_match:
initial_heading_pattern = 'arabic'
# Determine this line's heading pattern
if numbered_match:
current_heading_pattern = 'numbered'
elif parenthesis_match:
current_heading_pattern = 'parentheses'
2024-10-09 13:50:28 +08:00
elif letter_match:
2024-11-06 17:29:45 +08:00
current_heading_pattern = 'letter'
2024-10-09 13:50:28 +08:00
elif arabic_match:
2024-11-06 17:29:45 +08:00
current_heading_pattern = 'arabic'
# When it matches the initial pattern, start a new key/value pair
if current_heading_pattern == initial_heading_pattern:
# Flush the previous key's content
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
2024-10-09 13:50:28 +08:00
2024-11-06 17:29:45 +08:00
# Handle the matched heading
if current_heading_pattern == 'numbered':
current_key_chinese = numbered_match.group(1)
current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'parentheses':
current_key_chinese = parenthesis_match.group(1)
current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'letter':
# Letter headings map to Chinese numerals
# (values A–J lost their characters in extraction — restore)
letter_key, letter_value = letter_match.groups()
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
elif current_heading_pattern == 'arabic':
arabic_key, arabic_value = arabic_match.groups()
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
last_main_number = current_key.rstrip('.')
continue
else:
# Pattern differs from the initial one: treat the line as content
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
else:
# No heading pattern matched: treat the line as content
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
else:
# Inside a special section every line is content of the current heading
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
2024-09-28 17:38:53 +08:00
# Finally flush the last key's content
2024-09-28 17:38:53 +08:00
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
2024-11-04 17:13:06 +08:00
if temp_title:
# Handle any still-unassociated temporary title
data["未分类标题"] = temp_title  # choose a different key name if needed
2024-09-28 17:38:53 +08:00
return data
2024-10-12 18:01:59 +08:00
#type=2时提取货物标的第一章招标公告时采用该逻辑
# Used when type=2: extracting chapter one (the tender announcement).
def parse_text_to_dict(text):
    """Split `text` into a {heading: body} dict on top-level Chinese headings.

    A heading is a line like '一、...' (Chinese numeral + 、) anchored at the
    start of a line; everything up to the next heading (or end of text)
    becomes its body, whitespace-normalized and newline-collapsed via
    clean_content().

    :param text: the text to parse
    :return: dict mapping heading lines to their cleaned bodies
    """
    heading_re = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)
    headings = list(heading_re.finditer(text))
    parsed = {}
    for idx, heading in enumerate(headings):
        body_start = heading.end()
        # Body runs to the next heading, or to the end of the text
        body_end = headings[idx + 1].start() if idx + 1 < len(headings) else len(text)
        body = text[body_start:body_end].strip()
        body = body.replace('\r\n', '\n')                                     # unify line endings
        body = re.sub(r'[ \t]+\n', '\n', body)                                # drop trailing blanks
        body = re.sub(r'^[ \t]+|[ \t]+$', '', body, flags=re.MULTILINE)       # trim each line
        parsed[heading.group(1).strip()] = clean_content(body)                # collapse newlines
    return parsed
def clean_content(content):
    """Collapse newlines inside a section body while keeping meaningful ones.

    A newline survives only when the following text starts with a sub-item
    number (e.g. '1.1', '2、', '.3', '1)') or contains a colon before the
    next break; every other newline is removed without leaving a space.

    :param content: the body string to process
    :return: the processed string
    """
    # Sub-item numbering forms. NOTE(review): some full-width characters in
    # this pattern appear lost in extraction — restore from version control.
    numbering_pattern = r'(?:\d+[.]\d+(?:[.]\d+)*|\d+、|[.]\d+|\d+[)]|\(\d+\)|\d+|\d+[.])'
    # Keep a newline when the lookahead sees a numbered item or a colon-bearing run
    pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[:]))'
    placeholder = "___PLACEHOLDER___"
    # Protect the newlines we keep, flatten the rest, then restore
    protected = re.sub(pattern_keep, placeholder, content)
    flattened = protected.replace('\n', '')
    return flattened.replace(placeholder, '\n')
2024-10-17 15:33:58 +08:00
#如果file_path为空返回""
2024-10-09 13:50:28 +08:00
# Returns "" when file_path does not exist.
def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    """Extract clause text from a PDF and save it as a JSON file.

    type=1: bidder-notice body -> parse_text_by_heading -> clause1.json;
    otherwise: chapter-one announcement -> parse_text_to_dict -> clause2.json.
    `suffix_counter` is currently unused (a commented-out naming scheme
    referenced it — presumably per-file output names; confirm before use).

    :return: path of the written JSON file, or "" when file_path is missing
    """
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        # Bidder-notice body: start at 说明/总则, stop at the next chapter heading
        start_word = r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_by_heading(text)
    else:
        # Announcement chapter: start at 公告/邀请, stop at the next chapter heading
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_to_dict(text)
    # Create the output folder on demand
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    # file_name = f"clause{suffix_counter}.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
2024-09-30 16:23:39 +08:00
def process_folder(input_folder, output_folder):
    """Run convert_clause_to_json on every '*part2.pdf' file in a folder."""
    # Only regular files whose names end with 'part2.pdf' are processed
    candidate_names = [
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name)) and name.endswith('part2.pdf')
    ]
    for name in candidate_names:
        source_path = os.path.join(input_folder, name)
        stem = os.path.splitext(name)[0]  # filename without extension, passed as suffix
        try:
            result_path = convert_clause_to_json(source_path, output_folder, 1, stem)
            print(f"Processed file: {name}, JSON saved to: {result_path}")
        except ValueError as e:
            print(f"Error processing {name}: {e}")
2024-10-09 13:50:28 +08:00
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
2024-10-16 20:18:55 +08:00
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
2024-11-06 17:29:45 +08:00
#TODO:11.6 目前 '文件的组成' 是匹配任意行,可以考虑只匹配'11.文件的组成' 前面有序号的行 a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf还有问题
2024-09-28 17:38:53 +08:00
if __name__ == "__main__":
2024-10-12 18:01:59 +08:00
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
2024-11-06 17:29:45 +08:00
file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
2024-10-12 18:01:59 +08:00
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
2024-11-06 17:29:45 +08:00
output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\tmp'
2024-10-12 18:01:59 +08:00
try:
2024-10-16 20:18:55 +08:00
output_path = convert_clause_to_json(file_path,output_folder,1)
2024-10-12 18:01:59 +08:00
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
#
# # 调用 process_folder 来处理整个文件夹中的所有文件
# process_folder(input_folder, output_folder)