2024-11-11 17:12:38 +08:00
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf
|
|
|
|
|
|
|
|
|
|
def convert_clause_to_json(file_path: str, output_folder: str, type: int = 1) -> str:
    """Extract one section of a PDF and write its parsed clauses to a JSON file.

    The text is obtained with ``extract_text_from_pdf(file_path, start_word,
    end_pattern)`` and then handed to ``parse_text_by_heading``; the result is
    dumped as UTF-8 JSON into ``output_folder``.  Both helpers are imported
    from elsewhere in the project, so how exactly they apply the patterns is
    not visible here (presumably as start/end markers of the section --
    TODO confirm against the helper).

    Args:
        file_path: Path of the source document.  Only names ending in
            ``.pdf`` (compared case-insensitively) are accepted.
        output_folder: Directory that receives the JSON file; it is created
            when it does not exist yet.
        type: Selects the pattern set and the output file name.  ``1`` uses
            the patterns built around the "说明"/"总则"/"须知正文" headings and
            writes ``clause1.json``; any other value uses the
            announcement/invitation heading patterns and writes
            ``clause2.json``.  (The name shadows the builtin but is kept so
            that keyword callers keep working.)

    Returns:
        The path of the written JSON file, or ``""`` when ``file_path`` does
        not exist.

    Raises:
        ValueError: If ``file_path`` exists but is not a ``.pdf`` file.
    """
    if not os.path.exists(file_path):
        return ""

    # Compare the extension case-insensitively so that e.g. "NOTICE.PDF" is
    # accepted as well; fail before doing any other work.
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Unsupported file format")

    if type == 1:
        # Type 1: start at a (possibly numbered) "说明"/"总则" heading or at a
        # "...须知正文" heading.
        start_word = (
            r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        # End at a chapter/part heading, the evaluation front table, or an
        # appendix/attachment/attached-table label followed by a colon.
        end_pattern = (
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
            r'评标办法前附表|'
            r'附录(?:一)?[::]|'
            r'附件(?:一)?[::]|'
            r'附表(?:一)?[::]'
        )
    else:
        # Any other type: start at a tender announcement / invitation heading.
        start_word = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请)'
            r'|^第一卷'
            r'|^投标邀请书'
            r'|^投标邀请函'
            r'|^投标邀请'
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$'
        )
        # End at the "投标人须知" chapter / front table or at any other
        # chapter/part heading.
        end_pattern = (
            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$'
        )

    text = extract_text_from_pdf(file_path, start_word, end_pattern)
    parsed_data = parse_text_by_heading(text)

    # Create the output folder when it is missing.  exist_ok=True keeps a
    # concurrent caller that creates the same folder between the check and
    # the call from turning into a FileExistsError.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)

    return output_path
|
|
|
|
|
|
|
|
|
|
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
|
|
|
|
|
# # 读取 JSON 文件
|
|
|
|
|
# with open(json_file_path, 'r', encoding='utf-8') as file:
|
|
|
|
|
# data = json.load(file)
|
|
|
|
|
# processed_data = {}
|
|
|
|
|
# for key, value in data.items():
|
|
|
|
|
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
|
|
|
|
|
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
|
|
|
|
|
# # 分割标题和正文
|
|
|
|
|
# title, content = value.split('\n', 1)
|
|
|
|
|
#
|
|
|
|
|
# # 添加原来的标题作为 '5.0',其值为原来标题的内容(即 title)
|
|
|
|
|
# processed_data[key] = title.strip()
|
|
|
|
|
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
|
|
|
|
|
#
|
|
|
|
|
# processed_data[sub_key] = title.strip()
|
|
|
|
|
#
|
|
|
|
|
# # 初始化计数器
|
|
|
|
|
# sub_count = 1
|
|
|
|
|
#
|
|
|
|
|
# # 根据子序号 '1.' 或 '1、' 进行分割
|
|
|
|
|
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
|
|
|
|
|
#
|
|
|
|
|
# current_sub_content = ""
|
|
|
|
|
# for i in range(1, len(sub_sections), 2):
|
|
|
|
|
# sub_number = sub_sections[i].strip() # 获取子序号
|
|
|
|
|
# sub_content = sub_sections[i + 1].strip() # 获取内容
|
|
|
|
|
#
|
|
|
|
|
# # 生成三级标题,如 '5.0.1', '5.0.2'
|
|
|
|
|
# sub_key_with_number = f"{sub_key}.{sub_count}"
|
|
|
|
|
# processed_data[sub_key_with_number] = sub_content
|
|
|
|
|
# sub_count += 1
|
|
|
|
|
#
|
|
|
|
|
# else:
|
|
|
|
|
# # 如果没有分割需求,保留原数据
|
|
|
|
|
# processed_data[key] = value
|
|
|
|
|
|
|
|
|
|
# # 将修改后的数据重新写入到原来的 JSON 文件中
|
|
|
|
|
# with open(json_file_path, 'w', encoding='utf-8') as file:
|
|
|
|
|
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample PDF; the paths below are specific to
    # the developer's machine.
    # Alternative sample input:
    # r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
    sample_pdf = r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
    target_dir = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'

    try:
        result_path = convert_clause_to_json(sample_pdf, target_dir)
        print(f"Final JSON result saved to: {result_path}")
    except ValueError as err:
        # Report an unsupported input instead of showing a traceback.
        print("Error:", err)
|