zbparse/flask_app/main/投标人须知正文条款提取成json文件.py
2024-09-30 16:23:39 +08:00

246 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import docx
import re
import os
from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
    """Read a .docx file and return the text of all paragraphs joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
# def extract_text_from_pdf(file_path):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# pdf_document = PdfReader(file_path)
# text = ""
# # 遍历每一页
# for page in pdf_document.pages:
# # 提取当前页面的文本
# page_text = page.extract_text() if page.extract_text() else ""
# # 清洗页面文本
# page_text = clean_page_content(page_text, common_header)
# # 将清洗后的文本添加到总文本中
# text += page_text+"\n"
# return text
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract cleaned text from a PDF, trimmed to the span of interest.

    The common page header is stripped from every page. On the first page,
    text before the first match of ``start_word`` is dropped; on the last
    page, text from the last match of the first matching pattern in
    ``end_pattern`` onward is dropped.

    Args:
        file_path: Path to the PDF file.
        start_word: Regex locating where extraction begins (first page only).
        end_pattern: Iterable of regexes locating where extraction stops
            (last page only); the first pattern with any match wins, and the
            cut is at its *last* occurrence.

    Returns:
        The cleaned text of all pages, joined with newlines.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    num_pages = len(pdf_document.pages)
    all_pages_text = []
    start_index = None
    for i, page in enumerate(pdf_document.pages):
        # extract_text() re-parses the page; call it once, not twice.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)
        # On the first page, drop everything before the start marker.
        if i == 0 and start_index is None:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                start_index = start_match.start()
                cleaned_text = cleaned_text[start_index:]
        # On the last page, cut at the last occurrence of the first
        # end pattern that matches.
        if i == num_pages - 1:
            for pattern in end_pattern:
                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
                if matches:
                    cleaned_text = cleaned_text[:matches[-1].start()]
                    break
        all_pages_text.append(cleaned_text)
    return "\n".join(all_pages_text)
def extract_section(text, start_pattern, end_phrases):
    """Return the slice of *text* between *start_pattern* and the first
    matching phrase in *end_phrases*.

    The slice starts right after the start match. If no end phrase matches,
    the slice runs to the end of the text; if the start pattern is absent,
    an empty string is returned.
    """
    opening = re.search(start_pattern, text)
    if opening is None:
        # No section start found: nothing to extract.
        return ""
    begin = opening.end()
    remainder = text[begin:]
    stop = len(text)
    # Stop at the first end phrase that appears after the start match.
    for phrase in end_phrases:
        terminator = re.search(phrase, remainder, flags=re.MULTILINE)
        if terminator:
            stop = begin + terminator.start()
            break
    return text[begin:stop]
def compare_headings(current, new):
    """Return True if heading *new* sorts after heading *current*.

    Both are dotted numeric strings such as '1.2' or '2.2.3'; non-numeric
    segments are ignored. A heading that extends the other with extra
    levels counts as a later (sub-)heading.
    """
    def as_numbers(heading):
        return [int(part) for part in heading.split('.') if part.isdigit()]

    # Lexicographic list comparison matches the pairwise rules exactly:
    # the first differing level decides; otherwise the longer heading wins.
    return as_numbers(new) > as_numbers(current)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a newline should be kept before appended content.

    True when the accumulated *content* is still short (at most *max_length*
    characters after stripping), or when it mentions any of *keywords*.
    """
    accumulated = ''.join(content).strip()
    if len(accumulated) <= max_length:
        return True
    return any(keyword in accumulated for keyword in keywords)
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append *line_content* to *current_content*, inserting at most one '\\n'.

    The newline is inserted (before *line_content*) only when
    *append_newline* is True and should_add_newline() approves based on the
    keywords / length of the accumulated content. The returned flag is the
    caller's new append_newline state.
    """
    if append_newline and should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    # The single newline opportunity is consumed on every path, so the
    # flag the caller stores is always cleared.
    return False
#对二级标题如x.x进行额外处理如果当前处理内容包含keywords中的内容则必须保留换行符/如果当前内容字数大于20不保留换行。
def parse_text_by_heading(text):
keywords = ['包含', '以下']
data = {}
current_key = None
current_content = []
append_newline = False
lines = text.split('\n')
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip()
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
if match:
new_key, line_content = match.groups()
line_content = line_content.lstrip('.')
# 检查是否应该更新当前键和内容
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')): #current_content 内容占两行则是: ['本招标项目已具备招标条件,现对本项目施工监理进行招标,项目概况见本', '章附件一。']
if current_key is not None:
# 将之前的内容保存到data中保留第一个换行符后续的换行符转为空字符
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
current_key = new_key
current_content = [line_content]
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
# append_newline = len(new_key.split('.')) == 2
append_newline = len(new_key.rstrip('.').split('.')) <= 2
else:
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
else:
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
if current_key is not None:
# 保存最后一部分内容
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
return data
def convert_to_json(file_path, start_word, end_phrases):
    """Load a .docx or .pdf document and parse its clause headings into a dict.

    For PDFs the text is trimmed between *start_word* and *end_phrases*
    during extraction; .docx files are read whole.

    Raises:
        ValueError: If the file extension is neither .docx nor .pdf.
    """
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_phrases)
    elif file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")
    return parse_text_by_heading(text)
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract bid-clause text from *input_path* and save it as JSON.

    Args:
        input_path: Path to the source .pdf/.docx document.
        output_folder: Directory where the JSON result is written; created
            if missing.
        type: 1 extracts the "投标人须知正文" section into clause1.json;
            any other value extracts the announcement chapter into
            clause2.json. (Parameter name shadows the builtin ``type`` but
            is kept for caller compatibility.)

    Returns:
        The path of the written JSON file, or "" when *input_path* does not
        exist.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""
    if type == 1:
        start_word = "投标人须知正文"
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法',
            r'^评标办法前附表',
            r'^附(?:录|件|表)(?:一)?[:]'
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']
    result = convert_to_json(input_path, start_word, end_phrases)
    if not os.path.exists(output_folder):
        # exist_ok avoids a race if the folder appears between the check
        # above and this call (e.g. a concurrent request).
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
# # 读取 JSON 文件
# with open(json_file_path, 'r', encoding='utf-8') as file:
# data = json.load(file)
# processed_data = {}
# for key, value in data.items():
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
# # 分割标题和正文
# title, content = value.split('\n', 1)
#
# # 添加原来的标题作为 '5.0',其值为原来标题的内容(即 title
# processed_data[key] = title.strip()
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
#
# processed_data[sub_key] = title.strip()
#
# # 初始化计数器
# sub_count = 1
#
# # 根据子序号 '1.' 或 '1、' 进行分割
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
# current_sub_content = ""
# for i in range(1, len(sub_sections), 2):
# sub_number = sub_sections[i].strip() # 获取子序号
# sub_content = sub_sections[i + 1].strip() # 获取内容
#
# # 生成三级标题,如 '5.0.1', '5.0.2'
# sub_key_with_number = f"{sub_key}.{sub_count}"
# processed_data[sub_key_with_number] = sub_content
# sub_count += 1
#
# else:
# # 如果没有分割需求,保留原数据
# processed_data[key] = value
# # 将修改后的数据重新写入到原来的 JSON 文件中
# with open(json_file_path, 'w', encoding='utf-8') as file:
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Manual smoke test: extract the clause JSON for one sample document.
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)