"""Extract checkbox-style key/value pairs from a PDF and save them as JSON."""

import json
import re

# Leading serial number of an entry, e.g. "1.2 " or "3.4.5 ".
_SERIAL_PREFIX_RE = re.compile(r'^\d+(\.\d+)*\s+')
# Any "<digits>.<digits>" serial number appearing in a line.
_SERIAL_RE = re.compile(r'\d+\.\d+')
# "<serial> <key><marker><value>", where the marker is \x01, \x02 or '☑'.
_INLINE_PAIR_RE = re.compile(
    r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑]([\w\s\u4e00-\u9fff]+)'
)
# A line that begins with a marker (\x01, \x02, '☑' or '√') plus optional
# whitespace; what follows the match is the value text.
_LEADING_MARK_RE = re.compile(r'^[\x01\x02☑√]\s*')
# Characters that end the key text taken from a serial-numbered line.
_KEY_END_RE = re.compile(r'[\x01\x02□]')
# Punctuation that ends a value.  The original class listed the ASCII comma
# twice; the duplicate is restored here as the full-width comma (U+FF0C).
_VALUE_END_RE = re.compile(r'[,\uff0c。、]')
_WHITESPACE_RE = re.compile(r'\s+')
# Page number at the very start of a page's text: digits followed by whitespace.
_PAGE_NUMBER_RE = re.compile(r'^\d+\s+')


def _clean_value(raw_value):
    """Return *raw_value* cut at the first terminating punctuation, stripped."""
    return _VALUE_END_RE.split(raw_value)[0].strip()


def extract_key_value_pairs(text):
    """Extract key/value pairs from the text of one page.

    Two passes are made over *text*:

    1. Inline pairs of the form ``<serial> <key><marker><value>`` are
       collected with a single regex.  Whitespace and ASCII parentheses are
       removed from the key.
    2. Every line that starts with a marker is treated as a value whose key
       comes from the most recent preceding line containing a serial number.
       If that key is already present, spaces are appended to it (one more
       per attempt) until it is unique, so earlier entries are kept.

    Args:
        text: Text of a single page; may be empty.

    Returns:
        dict mapping cleaned key strings to cleaned value strings.
    """
    results = {}

    # Pass 1: inline "<serial> <key><marker><value>" pairs.
    # NOTE(review): the value group accepts \s and \w, so it can run across a
    # newline into the digits of a following serial number - confirm that this
    # is acceptable for the real PDF text.
    for raw_key, raw_value in _INLINE_PAIR_RE.findall(text):
        key = _SERIAL_PREFIX_RE.sub('', raw_key)
        key = _WHITESPACE_RE.sub('', key).replace('(', '').replace(')', '')
        results[key] = _clean_value(raw_value)

    # Pass 2: marker at the start of a line; the key is on an earlier line.
    last_serial_line = ""
    for line in text.split('\n'):
        mark = _LEADING_MARK_RE.match(line)
        if mark is None:
            # Remember the latest line that carries a serial number.
            if _SERIAL_RE.search(line):
                last_serial_line = line
            continue
        if not last_serial_line:
            # No serial-numbered line seen yet, so there is no key to use.
            continue

        value = _clean_value(line[mark.end():])
        # Key text is the serial line without its number, up to the first
        # marker or empty-box character.
        base_key = _SERIAL_PREFIX_RE.sub('', last_serial_line)
        base_key = _KEY_END_RE.split(base_key)[0].strip()

        # Keep earlier entries: pad the key with spaces until it is unused.
        key = base_key
        padding = 1
        while key in results:
            key = f"{base_key}{' ' * padding}"
            padding += 1
        results[key] = value

    return results


def read_pdf_and_judge_main(file_path, output_json_path):
    """Read a PDF, extract key/value pairs page by page and dump them as JSON.

    Args:
        file_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to write (UTF-8, indent 4).
    """
    # Imported here so the pure text-parsing helper above can be imported
    # without the third-party PDF dependency being installed.
    import PyPDF2

    all_data = {}
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f"Total pages: {len(reader.pages)}")
        for page in reader.pages:
            # Extract once per page; fall back to "" when nothing is returned.
            text = page.extract_text() or ""
            # Drop the page number at the start of the page text.
            cleaned_text = _PAGE_NUMBER_RE.sub('', text)
            # Keys repeated on a later page overwrite the earlier page's value.
            all_data.update(extract_key_value_pairs(cleaned_text))

    # Save everything as a JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)
    print(f"Data extraction complete and saved to '{output_json_path}'.")


if __name__ == "__main__":
    # Example invocation.
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_tobidders_notice_table.pdf'
    output_json_path = 'judge_exist.json'
    read_pdf_and_judge_main(file_path, output_json_path)