2024-09-27 18:07:34 +08:00

211 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import docx
import re
import os
from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``document.paragraphs`` joined
        with ``'\\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
def extract_text_from_pdf(file_path):
    """Extract the cleaned text of every page of a PDF file.

    Each page's text is passed through ``clean_page_content`` together with
    the value returned by ``extract_common_header(file_path)`` (both helpers
    are imported from another module; presumably they strip the repeated
    page header -- confirm against their definitions).

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The cleaned page texts concatenated in page order, each followed by
        a single ``"\\n"``. An empty string when the PDF has no pages.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)

    cleaned_pages = []
    for page in pdf_document.pages:
        # Extract once per page: text extraction is the expensive step, and
        # a falsy result (None or "") is normalised to an empty string.
        page_text = page.extract_text() or ""
        cleaned_pages.append(clean_page_content(page_text, common_header) + "\n")

    # Join once instead of growing a string inside the loop.
    return "".join(cleaned_pages)
def extract_section(text, start_pattern, end_phrases):
    """Return the part of *text* between a start pattern and an end phrase.

    Args:
        text: The full text to search.
        start_pattern: Regular expression marking the start. The section
            begins right after the end of its first match.
        end_phrases: Iterable of regular expressions, tried in the given
            order with ``re.MULTILINE`` against the text that follows the
            start match. The first phrase *in list order* that matches
            anywhere ends the section (not necessarily the phrase that
            occurs earliest in the text).

    Returns:
        The text after the start match up to (not including) the chosen end
        match, or up to the end of *text* when no end phrase matches.
        An empty string when *start_pattern* is not found.
    """
    start_match = re.search(start_pattern, text)
    if not start_match:
        return ""

    # Slice the tail once; it is the same for every end phrase.
    remainder = text[start_match.end():]

    for phrase in end_phrases:
        end_match = re.search(phrase, remainder, flags=re.MULTILINE)
        if end_match:
            return remainder[:end_match.start()]

    return remainder
def compare_headings(current, new):
    """Tell whether heading number *new* comes after heading number *current*.

    Both arguments are dotted heading numbers such as ``'1.1'``, ``'2.3.4'``
    or ``'5.'``. Only the dot-separated parts made purely of decimal digits
    are compared; empty or non-numeric parts are ignored.

    Args:
        current: Heading number currently being collected.
        new: Candidate heading number found on a later line.

    Returns:
        ``True`` when, at the first differing level, *new* is larger, or
        when all shared levels are equal and *new* has more levels (a
        sub-heading). ``False`` otherwise, including when both are equal.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as '²' or '①' that int() cannot parse and would raise ValueError.
    current_nums = [int(part) for part in current.split('.') if part.isdecimal()]
    new_nums = [int(part) for part in new.split('.') if part.isdecimal()]

    for current_level, new_level in zip(current_nums, new_nums):
        if new_level > current_level:
            return True
        if new_level < current_level:
            return False

    # All shared levels are equal: a deeper heading counts as a new sub-section.
    return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    Args:
        content: Iterable of string fragments; they are concatenated and
            stripped of surrounding whitespace before being examined.
        keywords: Iterable of substrings to look for in the joined content.
        max_length: Length limit (inclusive) under which the content
            qualifies regardless of keywords.

    Returns:
        ``True`` when the joined content contains any keyword or is at most
        *max_length* characters long, ``False`` otherwise.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append one line to the content being collected for a heading.

    When *append_newline* is set, ``should_add_newline`` is consulted on the
    content gathered so far and, if it agrees, a ``'\\n'`` fragment is
    inserted before the new line.

    Args:
        current_content: List of fragments; mutated in place.
        line_content: Text to append.
        append_newline: Flag saying a line break may still be inserted.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The flag to use for the next line: *append_newline* unchanged when
        it was falsy, otherwise ``False``.
    """
    # NOTE(review): the source had lost its indentation; the flag is taken to
    # be cleared after the first continuation line whether or not a newline
    # was actually inserted -- confirm against the original file.
    if not append_newline:
        current_content.append(line_content)
        return append_newline

    if should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    return False
def parse_text_by_heading(text):
    """Split clause text into a ``{heading number: body}`` dictionary.

    The text is processed line by line. A line starts a heading when, after
    stripping, it begins with a dotted number such as ``1.1`` / ``2.2.3``
    or, failing that, with a single number followed by a dot and some text
    (``5. Title``). A heading line opens a new entry only when
    ``compare_headings`` says its number comes after the current one;
    otherwise the text after the number is appended to the current entry.
    Non-heading, non-empty lines are appended to the current entry.

    Extra handling for headings whose number splits into exactly two parts
    on ``'.'`` (``x.y`` and also ``x.``): the first line appended after such
    a heading may be preceded by a line break, as decided by
    ``handle_content_append`` using the keywords ``'包含'`` / ``'以下'``.
    All other lines are joined without a separator, and ASCII spaces are
    removed from every stored body.

    Args:
        text: The clause text to parse.

    Returns:
        Dict mapping each accepted heading number to its collected body.
        Lines that precede the first heading are not stored.
    """
    keywords = ['包含', '以下']
    # Compiled once instead of being looked up for every line.
    multi_level_heading = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
    single_level_heading = re.compile(r'^(\d+\.)\s*(.+)$')

    data = {}
    current_key = None
    current_content = []
    append_newline = False

    for line in text.split('\n'):
        line_stripped = line.strip()
        match = (multi_level_heading.match(line_stripped)
                 or single_level_heading.match(line_stripped))

        if not match:
            if line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        # NOTE(review): the previous condition additionally required
        # `current_content[-1][-1] != ''`. A single character never equals
        # '', so that test was always true, and it raised IndexError when the
        # previous heading had no text on its own line (e.g. "1.1" directly
        # followed by "1.2 foo"). If a specific trailing character was meant
        # to veto a new heading, reinstate it with
        # `current_content[-1].endswith(<char>)`.
        if current_key is None or compare_headings(current_key, new_key):
            if current_key is not None:
                # Store the finished entry before switching to the new one.
                content_string = ''.join(current_content).strip()
                data[current_key] = content_string.replace(' ', '')
            current_key = new_key
            current_content = [line_content]
            # Only two-part numbers (e.g. '1.1', or '5.' whose second part
            # is empty) may get a line break before their first continuation.
            append_newline = len(new_key.split('.')) == 2
        else:
            append_newline = handle_content_append(
                current_content, line_content, append_newline, keywords)

    if current_key is not None:
        # Store the last entry.
        content_string = ''.join(current_content).strip()
        data[current_key] = content_string.replace(' ', '')
    return data
def convert_to_json(file_path, start_word, end_phrases):
    """Extract one section of a document and parse it by heading number.

    Args:
        file_path: Path of a ``.docx`` or ``.pdf`` file. The extension is
            matched case-insensitively.
        start_word: Regular expression marking the start of the section
            (passed to ``extract_section``).
        end_phrases: Regular expressions marking the end of the section
            (passed to ``extract_section``).

    Returns:
        The dictionary produced by ``parse_text_by_heading`` for the
        extracted section.

    Raises:
        ValueError: If the file extension is neither ``.docx`` nor ``.pdf``.
    """
    # Compare on a lower-cased copy so 'FILE.PDF' / 'File.Docx' are accepted;
    # the original path is still what gets opened.
    lowered_path = file_path.lower()
    if lowered_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    elif lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")

    section_text = extract_section(text, start_word, end_phrases)
    return parse_text_by_heading(section_text)
def convert_clause_to_json(input_path,output_folder,type=1):
    """Extract clauses from a tender document and save them as a JSON file.

    Args:
        input_path: Path of the ``.docx`` / ``.pdf`` document to process.
        output_folder: Directory that receives the JSON file; it is created
            when it does not exist yet.
        type: ``1`` extracts the section that starts at "投标人须知正文" and
            writes ``clause1.json``; any other value extracts the section
            that starts at the tender-announcement markers and writes
            ``clause2.json``.

    Returns:
        The path of the written JSON file, or ``""`` when *input_path* does
        not exist.

    Raises:
        ValueError: Propagated from ``convert_to_json`` for unsupported
            file extensions.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""

    # The colon classes accept both the ASCII ':' and the full-width ':'.
    # The previous list contained each ASCII pattern twice, which strongly
    # suggests the full-width variants had been lost.
    if type == 1:
        start_word = "投标人须知正文"
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表',
            r'^附录[::]', r'^附录一[::]', r'^附件[::]', r'^附件一[::]',
            r'^附表[::]', r'^附表一[::]',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号[::]'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    result = convert_to_json(input_path, start_word, end_phrases)

    file_name = "clause1.json" if type == 1 else "clause2.json"
    # Make sure the destination exists so open() below cannot fail on a
    # missing directory.
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    # Rewrites the file in place to split over-long one-level entries.
    post_process_json(output_path)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
def post_process_json(json_file_path):
    """Split over-long one-level entries of a clause JSON file, in place.

    The file is expected to hold a JSON object. An entry is rewritten when
    its key is a one-level heading number (digits followed by a dot, e.g.
    ``'5.'``) and its value contains a line break. For such an entry:

    * the text before the first line break becomes the value of the
      original key and of a new ``'<n>.0'`` key;
    * the remaining text is split on sub-numbers of the form ``1.`` or
      ``1、`` and each piece is stored under ``'<n>.0.1'``, ``'<n>.0.2'``,
      ... in order of appearance.

    Entries that do not qualify, and qualifying entries whose remaining
    text contains no sub-number at all, are copied unchanged.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = {}
    for key, value in data.items():
        if not (re.match(r'^\d+\.\s*$', key) and '\n' in value):
            processed_data[key] = value
            continue

        title, content = value.split('\n', 1)
        # With one capturing group re.split yields
        # [text before, number, body, number, body, ...].
        sub_sections = re.split(r'(\d+[\.\、])\s*', content)
        if len(sub_sections) < 3:
            # No numbered sub-items: keep the entry as it is instead of
            # silently discarding everything after the title.
            processed_data[key] = value
            continue

        title = title.strip()
        processed_data[key] = title
        sub_key = f"{key.rstrip('.')}.0"  # e.g. '5.' -> '5.0'
        processed_data[sub_key] = title

        # NOTE(review): sub_sections[0], the text between the title and the
        # first sub-number, is still dropped as before -- decide where it
        # should be stored.
        for sub_count, sub_content in enumerate(sub_sections[2::2], start=1):
            processed_data[f"{sub_key}.{sub_count}"] = sub_content.strip()

    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Manual run against a local sample document. The paths below are
    # developer-machine specific.
    # Alternate sample input:
    # 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件广水市教育局封闭管理_qualification1.pdf'

    try:
        # Default type (1): extract the clause section and write clause1.json.
        output_path = convert_clause_to_json(file_path, output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        # Raised for unsupported file extensions.
        print("Error:", e)