zbparse/flask_app/main/投标人须知正文条款提取成json文件.py

246 lines
11 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
import json
import docx
import re
import os
2024-09-18 11:57:17 +08:00
from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header
2024-08-29 16:37:09 +08:00
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Only the objects exposed by ``Document.paragraphs`` are read; their
    ``text`` values are joined with a newline.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
2024-09-28 17:38:53 +08:00
# def extract_text_from_pdf(file_path):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# pdf_document = PdfReader(file_path)
# text = ""
# # 遍历每一页
# for page in pdf_document.pages:
# # 提取当前页面的文本
# page_text = page.extract_text() if page.extract_text() else ""
# # 清洗页面文本
# page_text = clean_page_content(page_text, common_header)
# # 将清洗后的文本添加到总文本中
# text += page_text+"\n"
# return text
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract the cleaned text of a PDF, trimmed at both ends.

    Args:
        file_path: Path of the PDF to read.
        start_word: Regex searched (``re.MULTILINE``) on the first page only;
            when it matches, everything before the match is dropped.
        end_pattern: Iterable of regexes tried in order on the last page only.
            The first pattern that matches wins, and the page is cut at the
            start of that pattern's LAST occurrence.

    Returns:
        The cleaned text of all pages joined with a newline. When a marker is
        not found the corresponding end of the text is left untrimmed.
    """
    # presumably the header line(s) repeated on every page — verify against
    # extract_common_header in flask_app.main.截取pdf
    common_header = extract_common_header(file_path)
    pages = PdfReader(file_path).pages
    last_page = len(pages) - 1  # hoisted: invariant across the loop

    all_pages_text = []
    for i, page in enumerate(pages):
        # Call extract_text() once per page; the original evaluated it twice.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # Look for the start marker on the first page only.
        if i == 0:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                cleaned_text = cleaned_text[start_match.start():]

        # Look for the end marker on the last page only.
        if i == last_page:
            for pattern in end_pattern:
                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
                if matches:
                    cleaned_text = cleaned_text[:matches[-1].start()]
                    break

        all_pages_text.append(cleaned_text)

    # Merge the text of all pages.
    return "\n".join(all_pages_text)
def extract_section(text, start_pattern, end_phrases):
    """Return the part of *text* between a start pattern and an end phrase.

    Args:
        text: The text to cut from.
        start_pattern: Regex locating the start; the result begins right
            AFTER this match.
        end_phrases: Iterable of regexes (compiled with ``re.MULTILINE``)
            tried in list order. The first phrase that matches anywhere after
            the start decides the end; later phrases are not consulted even
            if they would match earlier in the text.

    Returns:
        The text after the start match and before the end match, the text up
        to the end of *text* when no end phrase matches, or ``""`` when the
        start pattern is not found.
    """
    start_match = re.search(start_pattern, text)
    if not start_match:
        return ""  # start pattern not found

    start_index = start_match.end()
    end_index = len(text)  # default: run to the end of the text

    for phrase in end_phrases:
        # Search the full text from start_index instead of a slice: with a
        # slice, '^' would match at the cut position even when that position
        # is in the middle of a line. It also avoids re-copying the tail of
        # the text for every phrase.
        match = re.compile(phrase, re.MULTILINE).search(text, start_index)
        if match:
            end_index = match.start()
            break  # first phrase that matches wins

    return text[start_index:end_index]
def compare_headings(current, new):
    """Tell whether heading number *new* follows heading number *current*.

    Both arguments are dotted numbers such as ``'1.1'`` or ``'3.'``; parts
    that are not purely digits (e.g. the empty part after a trailing dot)
    are ignored.

    Returns:
        True when, at the first level where the two differ, *new* is larger;
        False when it is smaller. When all shared levels are equal, True only
        if *new* has more levels than *current* (i.e. it is a sub-section).
    """
    def _levels(heading):
        return [int(part) for part in heading.split('.') if part.isdigit()]

    current_levels = _levels(current)
    new_levels = _levels(new)

    for cur, nxt in zip(current_levels, new_levels):
        if cur != nxt:
            return nxt > cur

    # Every shared level is equal: a deeper heading counts as a new sub-section.
    return len(new_levels) > len(current_levels)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the text gathered so far.

    Args:
        content: Iterable of string fragments; they are concatenated and
            stripped before being examined.
        keywords: Substrings that force a line break when present.
        max_length: Texts of at most this many characters also get a break.

    Returns:
        True if any keyword occurs in the joined text or the joined text is
        no longer than *max_length*; otherwise False.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append *line_content* to *current_content*, optionally after a newline.

    When *append_newline* is truthy, ``should_add_newline`` is asked whether
    a ``'\\n'`` fragment should be inserted before the new line.

    Args:
        current_content: List of fragments collected so far; mutated in place.
        line_content: The fragment to append.
        append_newline: Whether a line-break decision is still pending.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The updated pending flag: False once a decision has been made, or the
        incoming value unchanged when no decision was pending.
    """
    # NOTE(review): the source's indentation was lost; this assumes the flag
    # is cleared after the first check whatever its outcome — confirm.
    if not append_newline:
        current_content.append(line_content)
        return append_newline

    if should_add_newline(current_content, keywords):
        current_content.append('\n')  # keep the line break
    current_content.append(line_content)
    return False
# Extra handling for second-level headings such as "x.x": if the text gathered
# so far contains one of the keywords the line break must be kept; if it is
# longer than 20 characters the line break is dropped.
def parse_text_by_heading(text):
    """Split clause text into a ``{heading number: content}`` dictionary.

    The text is processed line by line. A line starting with a dotted number
    (``'1.1'``, ``'2.2.3'``) or a single number followed by a dot (``'3.'``)
    opens a new entry when ``compare_headings`` says it follows the current
    heading; otherwise the line is treated as a continuation of the current
    entry. Blank lines are skipped.

    Args:
        text: The clause text, with ``'\\n'`` separating lines.

    Returns:
        A dict mapping each heading number to its content. The content is the
        concatenation of its fragments, stripped, with all ASCII spaces
        removed. A heading number seen twice overwrites the earlier entry.
    """
    keywords = ['包含', '以下']
    # Headings with at least one dot, e.g. '1.1' or '2.2.3'.
    # NOTE(review): anchored at position 0 the look-behind cannot reject
    # anything; the pattern is kept unchanged.
    dotted_heading = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
    # Fallback: a single number followed by a dot, e.g. '3. xxx'.
    top_heading = re.compile(r'^(\d+\.)\s*(.+)$')

    data = {}
    current_key = None
    current_content = []
    append_newline = False

    def _store(key, fragments):
        # Newline fragments inside the content survive; only surrounding
        # whitespace and ASCII spaces are removed.
        data[key] = ''.join(fragments).strip().replace(' ', '')

    # Lines come from split('\n'), so fragments carry no trailing newline.
    for line in text.split('\n'):
        line_stripped = line.strip()
        match = dotted_heading.match(line_stripped) or top_heading.match(line_stripped)

        if not match:
            if line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        # NOTE(review): the original additionally required
        # ``current_content[-1][-1] != ''``. A single character never equals
        # '', so that test was always true, and it raised IndexError when the
        # last fragment was empty (e.g. a bare "1.1" line followed by
        # "1.2 ..."). It is dropped here; if a specific character was meant,
        # restore the comparison with a guard for empty fragments.
        if current_key is None or compare_headings(current_key, new_key):
            if current_key is not None:
                _store(current_key, current_content)
            current_key = new_key
            current_content = [line_content]
            # A line-break decision is pending only for headings with at
            # most two levels (e.g. '1.' or '1.1').
            append_newline = len(new_key.rstrip('.').split('.')) <= 2
        else:
            # Looks like a heading but does not follow the current one:
            # treat its text as a continuation.
            append_newline = handle_content_append(
                current_content, line_content, append_newline, keywords)

    if current_key is not None:
        # Save the last entry.
        _store(current_key, current_content)
    return data
def convert_to_json(file_path, start_word, end_phrases):
    """Read a .docx or .pdf file and parse its clauses by heading number.

    Args:
        file_path: Path of the input file (string or path-like). The
            extension is matched case-insensitively.
        start_word: Start regex; used for PDF input only.
        end_phrases: End regexes; used for PDF input only.

    Returns:
        The dictionary produced by ``parse_text_by_heading``.

    Raises:
        ValueError: If the file is neither ``.docx`` nor ``.pdf``.
    """
    # Case-insensitive so that e.g. 'X.PDF' is accepted; os.fspath lets
    # pathlib.Path objects through as well.
    lowered = os.fspath(file_path).lower()
    if lowered.endswith('.docx'):
        # The whole document is parsed: start_word / end_phrases are not
        # applied on this path.
        text = extract_text_from_docx(file_path)
    elif lowered.endswith('.pdf'):
        # PDF text is already trimmed to start_word .. end_phrases here.
        text = extract_text_from_pdf(file_path, start_word, end_phrases)
    else:
        raise ValueError("Unsupported file format")
    return parse_text_by_heading(text)
def convert_clause_to_json(input_path,output_folder,type=1):
    """Parse the clauses of a tender document and save them as a JSON file.

    Args:
        input_path: Path of the .docx / .pdf file to parse.
        output_folder: Folder for the result; created if it does not exist.
        type: ``1`` selects the "投标人须知正文" markers and writes
            ``clause1.json``; any other value selects the tender-announcement
            markers and writes ``clause2.json``. (The name shadows the
            builtin but is kept for existing callers.)

    Returns:
        The path of the written JSON file, or ``""`` when *input_path* does
        not exist.

    Raises:
        ValueError: Propagated from ``convert_to_json`` for unsupported
            file formats.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""

    # NOTE(review): the colon classes below accept both the ASCII and the
    # full-width colon. The source had '[:]' and a duplicated
    # '招标编号:' alternative, which suggests the full-width variant was
    # lost — confirm.
    if type == 1:
        start_word = "投标人须知正文"
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法',
            r'^评标办法前附表',
            r'^附(?:录|件|表)(?:一)?[::]',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号[::]'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    result = convert_to_json(input_path, start_word, end_phrases)

    # Create the output folder if needed. exist_ok avoids failing when the
    # folder appears between the check and the creation.
    folder_missing = not os.path.exists(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    if folder_missing:
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
# # 读取 JSON 文件
# with open(json_file_path, 'r', encoding='utf-8') as file:
# data = json.load(file)
# processed_data = {}
# for key, value in data.items():
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
# # 分割标题和正文
# title, content = value.split('\n', 1)
#
# # 添加原来的标题作为 '5.0',其值为原来标题的内容(即 title
# processed_data[key] = title.strip()
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
#
# processed_data[sub_key] = title.strip()
#
# # 初始化计数器
# sub_count = 1
#
# # 根据子序号 '1.' 或 '1、' 进行分割
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
# current_sub_content = ""
# for i in range(1, len(sub_sections), 2):
# sub_number = sub_sections[i].strip() # 获取子序号
# sub_content = sub_sections[i + 1].strip() # 获取内容
#
# # 生成三级标题,如 '5.0.1', '5.0.2'
# sub_key_with_number = f"{sub_key}.{sub_count}"
# processed_data[sub_key_with_number] = sub_content
# sub_count += 1
#
# else:
# # 如果没有分割需求,保留原数据
# processed_data[key] = value
2024-09-25 15:58:27 +08:00
2024-09-30 16:23:39 +08:00
# # 将修改后的数据重新写入到原来的 JSON 文件中
# with open(json_file_path, 'w', encoding='utf-8') as file:
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
2024-09-25 15:58:27 +08:00
2024-08-29 16:37:09 +08:00
if __name__ == "__main__":
    # Manual run against a sample file on the developer's machine.
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
    # Alternative sample input:
    # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'

    try:
        # Default type=1: extract the "投标人须知正文" clauses.
        output_path = convert_clause_to_json(file_path, output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as err:
        # Raised for unsupported input file formats.
        print("Error:", err)