import json
import os

from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf


def convert_clause_to_json(file_path: str, output_folder: str, type: int = 1) -> str:
    """Extract a section of a tender PDF and save its parsed clauses as JSON.

    The text is pulled from the PDF with ``extract_text_from_pdf`` using a
    start/end regex pair selected by ``type``, parsed with
    ``parse_text_by_heading`` and dumped to ``clause1.json`` (``type == 1``)
    or ``clause2.json`` (any other value) inside ``output_folder``.

    Args:
        file_path: Path of the source document. Only ``.pdf`` files are
            supported; the extension is compared case-insensitively.
        output_folder: Directory that receives the JSON file. It is created
            when missing.
        type: ``1`` selects the patterns built around the "说明"/"总则"/
            "...须知正文" headings; any other value selects the patterns built
            around the "公告"/"邀请书"/"邀请函"/"邀请" headings. (The name
            shadows the builtin but is kept for caller compatibility.)

    Returns:
        The path of the written JSON file, or ``""`` when ``file_path`` does
        not exist.

    Raises:
        ValueError: If ``file_path`` exists but is not a ``.pdf`` file.
    """
    if not os.path.exists(file_path):
        return ""

    # NOTE(review): several character classes below contain the same character
    # twice (e.g. "[((]", "[))]", "[..]", "[::]"). Presumably one of each pair
    # was meant to be the full-width variant -- confirm against the original
    # file encoding. The patterns are kept exactly as found.
    if type == 1:
        # Raw-string patterns for type == 1.
        start_word = (
            r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
            # Earlier variant, kept disabled:
            # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
            r'评标办法前附表|'
            r'附录(?:一)?[::]|'
            r'附件(?:一)?[::]|'
            r'附表(?:一)?[::]'
        )
    else:
        # Raw-string patterns for every other type.
        start_word = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请)'
            r'|^第一卷'
            r'|^投标邀请书'
            r'|^投标邀请函'
            r'|^投标邀请'
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$'
        )
        end_pattern = (
            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$'
        )

    # Compare the extension case-insensitively so "X.PDF" is accepted as well.
    if file_path.lower().endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
    else:
        raise ValueError("Unsupported file format")

    parsed_data = parse_text_by_heading(text)

    # Create the output folder when it is missing. exist_ok=True avoids a
    # crash if another process creates the folder between the check and the
    # call; the message is still printed only when the folder was absent.
    folder_was_missing = not os.path.exists(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    if folder_was_missing:
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    return output_path


# Disabled helper kept for reference.
# def post_process_json(json_file_path):  # handle over-long content under first-level headings such as '5.1' (sample: zbtest20)
#     # Read the JSON file
#     with open(json_file_path, 'r', encoding='utf-8') as file:
#         data = json.load(file)
#     processed_data = {}
#     for key, value in data.items():
#         # Check whether the key is a first-level heading (e.g. '5.') whose value contains '\n'
#         if re.match(r'^\d+\.\s*$', key) and '\n' in value:
#             # Split the heading from the body
#             title, content = value.split('\n', 1)
#
#             # Add the original heading as '5.0'; its value is the original heading text (i.e. title)
#             processed_data[key] = title.strip()
#             sub_key = f"{key.rstrip('.')}." + "0"  # auto-generates '5.0', consistent with '5.'
#             # (continuation of the disabled helper) the generated sub-key keeps its trailing dot
#
#             processed_data[sub_key] = title.strip()
#
#             # Initialise the counter
#             sub_count = 1
#
#             # Split on sub-numbering such as '1.' or '1、'
#             sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
#             current_sub_content = ""
#             for i in range(1, len(sub_sections), 2):
#                 sub_number = sub_sections[i].strip()  # the sub-number
#                 sub_content = sub_sections[i + 1].strip()  # the content
#
#                 # Build third-level keys such as '5.0.1', '5.0.2'
#                 sub_key_with_number = f"{sub_key}.{sub_count}"
#                 processed_data[sub_key_with_number] = sub_content
#                 sub_count += 1
#
#         else:
#             # Nothing to split: keep the original entry
#             processed_data[key] = value
#     # Write the modified data back to the same JSON file
#     with open(json_file_path, 'w', encoding='utf-8') as file:
#         json.dump(processed_data, file, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Manual smoke run against a local sample file.
    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
    file_path = r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as e:
        print("Error:", e)
    else:
        # convert_clause_to_json returns "" when the input file is missing;
        # report that instead of announcing a save to an empty path.
        if output_path:
            print(f"Final JSON result saved to: {output_path}")
        else:
            print("Error:", f"input file not found: {file_path}")