# Source path: zbparse/flask_app/工程标/提取json工程标版.py
import json
import os
from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf
def _clause_boundary_patterns(type):
    """Return the ``(start_word, end_pattern)`` regex sources for *type*.

    ``type == 1`` selects the patterns whose start alternatives are headings
    such as 说明 / 总则 / 名词解释 or "...须知正文", and whose end alternatives
    are a chapter/part heading, 评标办法前附表, or an 附录/附件/附表 marker.
    Any other value selects the patterns whose start alternatives are
    announcement / invitation headings and whose end alternatives are the
    投标人须知 chapter heading, 投标人须知前附表, or a chapter/part heading.
    """
    if type == 1:
        start_word = (
            r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
            r'评标办法前附表|'
            r'附录(?:一)?[:]|'
            r'附件(?:一)?[:]|'
            r'附表(?:一)?[:]'
        )
    else:
        start_word = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请)'
            r'|^第一卷'
            r'|^投标邀请书'
            r'|^投标邀请函'
            r'|^投标邀请'
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$'
        )
        end_pattern = (
            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$'
        )
    return start_word, end_pattern


def convert_clause_to_json(file_path, output_folder, type=1):
    """Extract a section of a PDF and save its parsed headings as JSON.

    The text is obtained with ``extract_text_from_pdf(file_path, start_word,
    end_pattern)`` and structured with ``parse_text_by_heading``; the result
    is dumped to ``clause1.json`` (``type == 1``) or ``clause2.json`` (any
    other ``type``) inside ``output_folder``.

    Args:
        file_path: Path of the input document; only names ending in ``.pdf``
            are accepted.
        output_folder: Directory for the JSON file; created if missing.
        type: ``1`` selects the first pattern set and ``clause1.json``; any
            other value selects the second set and ``clause2.json``. (The
            name shadows the builtin but is kept for caller compatibility.)

    Returns:
        The path of the written JSON file, or ``""`` when ``file_path`` does
        not exist.

    Raises:
        ValueError: If ``file_path`` exists but does not end with ``.pdf``.
    """
    # A missing input is reported with an empty string, not an exception.
    if not os.path.exists(file_path):
        return ""

    if not file_path.endswith('.pdf'):
        raise ValueError("Unsupported file format")

    start_word, end_pattern = _clause_boundary_patterns(type)
    text = extract_text_from_pdf(file_path, start_word, end_pattern)
    parsed_data = parse_text_by_heading(text)

    # Create the output folder on demand; exist_ok tolerates the folder
    # appearing between the existence check and the makedirs call.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    return output_path
# NOTE: disabled helper, kept for reference only. It uses `re`, which the
# imports at the top of this file do not bring in.
# def post_process_json(json_file_path):  # Split over-long first-level entries such as '5.1' (sample: zbtest20)
#     # Read the JSON file
#     with open(json_file_path, 'r', encoding='utf-8') as file:
#         data = json.load(file)
#     processed_data = {}
#     for key, value in data.items():
#         # Check whether this is a first-level heading (e.g. '5.') whose value contains '\n'
#         if re.match(r'^\d+\.\s*$', key) and '\n' in value:
#             # Split the heading from the body text
#             title, content = value.split('\n', 1)
#
#             # Add the original heading as '5.0', whose value is the original heading text (i.e. title)
#             processed_data[key] = title.strip()
#             sub_key = f"{key.rstrip('.')}." + "0"  # Builds '5.0' from '5.', keeping the dot
#
#             processed_data[sub_key] = title.strip()
#
#             # Initialise the counter
#             sub_count = 1
#
#             # Split on sub-numbering such as '1.' or '1、'
#             sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
#             current_sub_content = ""
#             for i in range(1, len(sub_sections), 2):
#                 sub_number = sub_sections[i].strip()  # the sub-number
#                 sub_content = sub_sections[i + 1].strip()  # the content
#
#                 # Generate third-level keys such as '5.0.1', '5.0.2'
#                 sub_key_with_number = f"{sub_key}.{sub_count}"
#                 processed_data[sub_key_with_number] = sub_content
#                 sub_count += 1
#
#         else:
#             # Nothing to split: keep the original entry
#             processed_data[key] = value
#     # Write the modified data back to the original JSON file
#     with open(json_file_path, 'w', encoding='utf-8') as file:
#         json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Manual run against a local sample; the paths below are specific to the
    # developer's machine.
    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
    file_path = r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\ztbfile_tobidders_notice.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as e:
        print("Error:", e)
    else:
        # convert_clause_to_json returns "" when the input file is missing;
        # report that instead of announcing an empty output path.
        if output_path:
            print(f"Final JSON result saved to: {output_path}")
        else:
            print(f"Input file not found: {file_path}")