zbparse/flask_app/old_version/提取json工程标版.py

105 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf
def convert_clause_to_json(file_path, output_folder, type=1):
    """Extract one section of a PDF and save its parsed clauses as JSON.

    The text between a start pattern and an end pattern is obtained from
    ``extract_text_from_pdf``, handed to ``parse_text_by_heading`` and the
    result is dumped to ``clause1.json`` (``type == 1``) or ``clause2.json``
    (any other ``type``) inside ``output_folder``.

    Args:
        file_path: Path of the source document. Only ``.pdf`` is supported;
            the extension is compared case-insensitively.
        output_folder: Directory that receives the JSON file. It is created
            when it does not exist yet.
        type: ``1`` selects patterns that start at headings such as
            说明 / 总则 / 名词解释 or "...须知正文"; any other value selects
            patterns that start at announcement / invitation headings
            (公告, 投标邀请书, ...). The name shadows the builtin but is kept
            so existing keyword callers keep working.

    Returns:
        The path of the written JSON file, or ``""`` when ``file_path`` does
        not exist.

    Raises:
        ValueError: If ``file_path`` does not end with ``.pdf``.
    """
    if not os.path.exists(file_path):
        return ""

    # Reject unsupported inputs up front. Lower-casing the path means
    # "X.PDF" is accepted as well, and os.fspath lets path-like objects in.
    if not os.fspath(file_path).lower().endswith('.pdf'):
        raise ValueError("Unsupported file format")

    # NOTE(review): the page this file was viewed on warns about ambiguous
    # Unicode characters. Classes such as `[(]`, `[、..]` and `[:]` look like
    # they may have lost full-width variants -- confirm against the repository
    # copy before editing the patterns.
    if type == 1:
        # Patterns for type == 1.
        start_word = (
            r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
            r'评标办法前附表|'
            r'附录(?:一)?[:]|'
            r'附件(?:一)?[:]|'
            r'附表(?:一)?[:]'
        )
    else:
        # Patterns for every other type.
        start_word = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请)'
            r'|^第一卷'
            r'|^投标邀请书'
            r'|^投标邀请函'
            r'|^投标邀请'
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$'
        )
        end_pattern = (
            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$'
        )

    text = extract_text_from_pdf(file_path, start_word, end_pattern)
    parsed_data = parse_text_by_heading(text)

    # exist_ok=True avoids the check-then-create race of the former
    # `if not exists: makedirs`; the message is still printed only when the
    # folder was missing beforehand.
    folder_missing = not os.path.exists(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    if folder_missing:
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese text readable in the output.
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
# # 读取 JSON 文件
# with open(json_file_path, 'r', encoding='utf-8') as file:
# data = json.load(file)
# processed_data = {}
# for key, value in data.items():
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
# # 分割标题和正文
# title, content = value.split('\n', 1)
#
# # Add the original heading as '5.0'; its value is the original heading text (i.e. title)
# processed_data[key] = title.strip()
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
#
# processed_data[sub_key] = title.strip()
#
# # 初始化计数器
# sub_count = 1
#
# # 根据子序号 '1.' 或 '1、' 进行分割
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
# current_sub_content = ""
# for i in range(1, len(sub_sections), 2):
# sub_number = sub_sections[i].strip() # 获取子序号
# sub_content = sub_sections[i + 1].strip() # 获取内容
#
# # 生成三级标题,如 '5.0.1', '5.0.2'
# sub_key_with_number = f"{sub_key}.{sub_count}"
# processed_data[sub_key_with_number] = sub_content
# sub_count += 1
#
# else:
# # 如果没有分割需求,保留原数据
# processed_data[key] = value
# # 将修改后的数据重新写入到原来的 JSON 文件中
# with open(json_file_path, 'w', encoding='utf-8') as file:
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Manual run against a local sample file (Windows paths).
    # Alternative input, kept for reference:
    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
    file_path = r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\ztbfile_tobidders_notice.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp'

    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as err:
        # Raised for inputs that are not PDF files.
        print("Error:", err)
    else:
        print(f"Final JSON result saved to: {output_path}")