2024-08-29 16:37:09 +08:00
|
|
|
|
import json
|
|
|
|
|
import docx
|
|
|
|
|
import re
|
|
|
|
|
import os
|
2024-09-18 11:57:17 +08:00
|
|
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
from flask_app.main.截取pdf import clean_page_content,extract_common_header
|
2024-09-11 12:02:09 +08:00
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the Word document; passed to ``docx.Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with ``'\\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file_path):
    """Extract the cleaned text of every page of a PDF file.

    Each page's text is passed through ``clean_page_content`` together with
    the value returned by ``extract_common_header(file_path)`` (presumably the
    header repeated across pages - confirm against that helper).

    Args:
        file_path: Path of the PDF; passed to ``extract_common_header`` and
            ``PdfReader``.

    Returns:
        The cleaned page texts concatenated in page order, each one followed
        by a newline (so a non-empty document ends with ``'\\n'``). Returns an
        empty string when the PDF has no pages.
    """
    common_header = extract_common_header(file_path)
    reader = PdfReader(file_path)

    cleaned_pages = []
    for page in reader.pages:
        # Extract exactly once per page: extraction is the expensive step, so
        # the result is reused instead of calling extract_text() a second time.
        page_text = page.extract_text() or ""
        cleaned_pages.append(clean_page_content(page_text, common_header))

    # Join once at the end rather than growing a string with += per page.
    return "".join(page_text + "\n" for page_text in cleaned_pages)
|
|
|
|
|
|
2024-09-06 15:50:52 +08:00
|
|
|
|
def extract_section(text, start_pattern, end_phrases):
    """Return the part of *text* between a start pattern and an end phrase.

    Args:
        text: The full text to search.
        start_pattern: Regular expression locating the start of the section.
            The section begins immediately AFTER the first match.
        end_phrases: Iterable of regular expressions marking the end of the
            section. They are tried in the order given, each with
            ``re.MULTILINE``; the first phrase that matches anywhere after the
            start wins, even if a later phrase in the list would match at an
            earlier position in the text.

    Returns:
        The text from the end of the start match up to (not including) the
        winning end-phrase match; everything after the start match when no
        end phrase matches; ``""`` when *start_pattern* is not found.
    """
    start_match = re.search(start_pattern, text)
    if not start_match:
        return ""  # No start marker, so there is no section to return.

    # Slice once instead of once per end phrase. Searching the slice (rather
    # than passing a start position) means '^' also matches right at the
    # beginning of the remainder, i.e. directly after the start match.
    remainder = text[start_match.end():]

    for phrase in end_phrases:
        end_match = re.search(phrase, remainder, flags=re.MULTILINE)
        if end_match:
            # Stop at the first phrase (in list order) that matches.
            return remainder[:end_match.start()]

    return remainder
|
|
|
|
|
|
|
|
|
|
def compare_headings(current, new):
    """Tell whether heading number *new* comes after heading number *current*.

    Both arguments are dotted number strings such as ``"1."``, ``"1.1"`` or
    ``"2.2.3"``. Each is split on ``'.'`` and only the pieces made up of
    digits are kept, so empty pieces (e.g. after a trailing dot) are ignored.

    Args:
        current: Number of the heading currently being collected.
        new: Number of the candidate heading.

    Returns:
        True when the numeric parts of *new* are greater than those of
        *current* at the first position where they differ, or when they agree
        on every shared position and *new* has more parts (a sub-heading).
        False otherwise, including when both are equal.
    """
    def numeric_parts(heading):
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    current_parts = numeric_parts(current)
    new_parts = numeric_parts(new)

    # List comparison is lexicographic: the first differing element decides,
    # and a strict prefix sorts before the longer list.
    return new_parts > current_parts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    Args:
        content: Iterable of string fragments; they are concatenated and
            surrounding whitespace is stripped before testing.
        keywords: Substrings that force a line break when any of them occurs
            in the joined content.
        max_length: Joined content of at most this many characters also gets
            a line break.

    Returns:
        True when the joined content contains one of *keywords* or is no
        longer than *max_length*; False otherwise.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length
|
|
|
|
|
|
|
|
|
|
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append a line to the collected content, optionally after a line break.

    Args:
        current_content: List of string fragments; modified in place.
        line_content: Text to append.
        append_newline: Pending-line-break flag. When truthy, ``'\\n'`` is
            appended before *line_content* if ``should_add_newline`` approves
            of the content collected so far.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The flag to use for the next call: ``False`` once a truthy flag has
        been consumed (whether or not a break was inserted); a falsy flag is
        returned unchanged.
    """
    if append_newline:
        # The flag is one-shot: it is cleared even when no break is inserted.
        append_newline = False
        if should_add_newline(current_content, keywords):
            current_content.append('\n')

    current_content.append(line_content)
    return append_newline
|
|
|
|
|
|
|
|
|
|
# Extra handling for second-level headings such as "x.x": after the heading's
# first line a line break is kept when the collected text contains one of the
# keywords or is short (should_add_newline's limit, 20 characters by default);
# longer text without a keyword is joined to the following line directly.
def parse_text_by_heading(text):
    """Split numbered clause text into a ``{heading number: content}`` dict.

    The text is processed line by line (each line stripped):

    * A line starting with a dotted number (``1.1``, ``2.2.3`` ...) or with
      ``N.`` followed by text is a heading candidate. It opens a new entry
      when no entry is open yet, or when ``compare_headings`` says its number
      comes after the current one and the text collected so far does not end
      with ``'第'`` (presumably a clause reference such as "第 3.1 条" broken
      across lines - confirm). Otherwise its text is appended to the current
      entry.
    * Any other line starting with ``N.`` or ``N、`` is a list item. The first
      item under a heading stores the text collected so far under
      ``"<heading>.0"`` and clears it; every item is stored under
      ``"<heading>.0.<n>"`` with ``n`` counting from 1. Items seen before any
      heading are dropped.
    * Remaining non-empty lines are appended to the current entry.

    Content stored under a heading key has all ASCII spaces removed; the
    ``".0"`` and ``".0.<n>"`` entries are only stripped at both ends.

    Args:
        text: The clause text, lines separated by ``'\\n'``.

    Returns:
        Dictionary mapping heading numbers (strings) to their content. An
        entry replaced by a later heading is stored even when empty; the last
        open entry is stored only when it has content.
    """
    keywords = ['包含', '以下']

    # Compiled once instead of per line; the pattern strings are unchanged.
    # '1.1', '2.2.3', ...: a number with at least one dot-separated part.
    dotted_heading_re = re.compile(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)')
    # '1. text': a single number, a dot, and some text.
    top_heading_re = re.compile(r'^(\d+\.)\s*(.+)$')
    # '1.' / '1、' list items.
    list_item_re = re.compile(r'^(\d+)[\.\、]\s*(.*)$')

    data = {}
    current_key = None
    current_content = []
    append_newline = False
    sub_key_count = 0  # Whether the "<key>.0" entry has been written (0 or 1).
    third_key_count = 0  # Running number of list items under current_key.

    for line in text.split('\n'):
        line_stripped = line.strip()
        match = dotted_heading_re.match(line_stripped) or top_heading_re.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.')

            # endswith() instead of [-1][-1]: the last fragment can be an
            # empty string (a heading alone on its line), and indexing it
            # would raise IndexError.
            starts_new_entry = current_key is None or (
                compare_headings(current_key, new_key)
                and not (current_content and current_content[-1].endswith('第'))
            )

            if starts_new_entry:
                if current_key is not None:
                    # Save what was collected for the previous heading.
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                # List-item numbering restarts under every heading.
                sub_key_count = 0
                third_key_count = 0
                # Only two-part keys ('x.y', and 'x.' which splits into
                # ['x', '']) request a possible line break after this line.
                append_newline = len(new_key.split('.')) == 2
            else:
                append_newline = handle_content_append(
                    current_content, line_content, append_newline, keywords)
        else:
            item_match = list_item_re.match(line_stripped)
            if item_match:
                if current_key:
                    if sub_key_count == 0:
                        # First item: move the text collected so far to
                        # "<key>.0" and start collecting afresh.
                        sub_key_count += 1
                        data[f"{current_key}.0"] = ''.join(current_content).strip()
                        current_content = []
                    third_key_count += 1
                    data[f"{current_key}.0.{third_key_count}"] = item_match.group(2).strip()
            elif line_stripped:
                # Ordinary text line.
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)

    if current_key is not None:
        # Save the last open entry, unless nothing is left in it.
        content_string = ''.join(current_content).strip()
        if content_string:
            data[current_key] = content_string.replace(' ', '')

    return data
|
|
|
|
|
|
2024-09-25 10:45:32 +08:00
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
def convert_to_json(file_path, start_word, end_phrases):
    """Read a document, cut out one section and parse it by heading.

    Despite the name, the result is a dictionary, not a JSON string.

    Args:
        file_path: Path of a ``.docx`` or ``.pdf`` file. The extension is
            compared case-insensitively, so ``.PDF`` / ``.DOCX`` work too.
        start_word: Regular expression marking the start of the section;
            passed to ``extract_section``.
        end_phrases: Regular expressions marking the end of the section;
            passed to ``extract_section``.

    Returns:
        The dictionary produced by ``parse_text_by_heading`` for the
        extracted section.

    Raises:
        ValueError: If the file is neither ``.docx`` nor ``.pdf``.
    """
    # Lower-case only for the comparison; the original path is what gets opened.
    lowered_path = file_path.lower()
    if lowered_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    elif lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Keep only the text between start_word and the first matching end phrase.
    section_text = extract_section(text, start_word, end_phrases)
    return parse_text_by_heading(section_text)
|
|
|
|
|
|
2024-09-06 15:50:52 +08:00
|
|
|
|
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract clauses from a tender document and save them as a JSON file.

    Args:
        input_path: Path of the source ``.docx`` / ``.pdf`` file.
        output_folder: Folder for the result; created if it does not exist.
            An empty string writes to the current working directory.
        type: ``1`` extracts the section after "投标人须知正文" and writes
            ``clause1.json``; any other value extracts the section selected by
            the tender-announcement patterns and writes ``clause2.json``.
            (The name shadows the builtin but is kept for callers that pass
            it by keyword.)

    Returns:
        The path of the written JSON file, or ``""`` when *input_path* does
        not exist.

    Raises:
        ValueError: Propagated from ``convert_to_json`` for unsupported file
            formats.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""

    if type == 1:
        start_word = "投标人须知正文"
        # NOTE(review): the second row repeats patterns already listed (as
        # written here the repeats are identical and therefore never decide
        # the match); presumably one set was meant to use the full-width
        # colon - confirm before changing.
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
            r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    result = convert_to_json(input_path, start_word, end_phrases)

    file_name = "clause1.json" if type == 1 else "clause2.json"
    # Make sure the target folder exists so open() does not fail on a fresh
    # output location; skipped for "" because makedirs("") raises.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample file; the paths are machine-specific.
    sample_file = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
    target_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'

    try:
        saved_path = convert_clause_to_json(sample_file, target_folder)
        print(f"Final JSON result saved to: {saved_path}")
    except ValueError as err:
        # Raised for unsupported file formats.
        print("Error:", err)
|