zbparse/flask_app/main/提取打勾符号.py
2024-08-29 16:37:09 +08:00

84 lines
3.3 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import PyPDF2
import json
def extract_key_value_pairs(text):
    """Extract the options marked as selected from a block of text.

    Two layouts are recognised:

    1. Inline: ``<n.m> <key><mark><value>``, where the mark is one of the
       control characters U+0001 / U+0002 or the checked box ``☑``.
    2. Mark at the start of a line (U+0001, U+0002, ``☑`` or ``√``): the rest
       of the line is the value, and the key comes from the most recent
       earlier line that contains a serial number such as ``1.2``. Marked
       lines seen before any such line are ignored.

    Values are cut at the first ``,``, ``。``, ``、`` or ``,``. When the second
    layout produces a key that already exists, trailing spaces are appended to
    the key until it is unique, so earlier entries are never overwritten.

    Args:
        text: Text to scan. Empty or ``None`` yields an empty result.

    Returns:
        dict mapping each extracted key to its selected value.
    """
    if not text:
        return {}

    # Compile every pattern once up front instead of inside the loops.
    inline_pattern = re.compile(
        r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑]([\w\s\u4e00-\u9fff]+)'
    )
    serial_prefix = re.compile(r'^\d+(\.\d+)*\s+')
    serial_number = re.compile(r'\d+\.\d+')
    whitespace = re.compile(r'\s+')
    value_stop = re.compile(r'[,。、,]')
    leading_mark = re.compile(r'^[\x01\x02☑√]\s*')
    key_stop = re.compile(r'[\x01\x02□]')

    results = {}

    # Pass 1: key and value sit on either side of an inline mark.
    for raw_key, raw_value in inline_pattern.findall(text):
        # Drop a leading serial number such as "1.2 " from the key.
        key = serial_prefix.sub('', raw_key)
        # Remove all whitespace and any marker control characters. The
        # markers are written as escapes so they stay visible and survive
        # tools that drop invisible characters.
        key = whitespace.sub('', key).replace('\x01', '').replace('\x02', '')
        # Keep the value only up to the first stop punctuation mark.
        results[key] = value_stop.split(raw_value)[0].strip()

    # Pass 2: the mark is the first character of a line.
    last_serial_line = ""
    for line in text.split('\n'):
        mark = leading_mark.match(line)
        if mark is None:
            # Remember the latest line carrying a serial number; it supplies
            # the key for any marked lines that follow.
            if serial_number.search(line):
                last_serial_line = line
            continue
        if not last_serial_line:
            continue

        # Value: the text after the mark, up to the first stop punctuation.
        value = value_stop.split(line[mark.end():])[0].strip()

        # Key: the serial line without its number, cut at the first marker
        # control character or unchecked box.
        key = serial_prefix.sub('', last_serial_line)
        key = key_stop.split(key)[0].strip()

        # Disambiguate repeated keys by padding with trailing spaces.
        base_key = key
        padding = 1
        while key in results:
            key = f"{base_key}{' ' * padding}"
            padding += 1
        results[key] = value

    return results
def read_pdf_and_judge_main(file_path, output_json_path):
    """Extract the checked options from every page of a PDF and save as JSON.

    Each page's text is passed to ``extract_key_value_pairs`` and the
    per-page results are merged into one dict; when the same key appears on
    several pages, the value from the later page wins.

    Args:
        file_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to write (UTF-8, non-ASCII
            characters kept as-is, 4-space indent).
    """
    all_data = {}
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f"Total pages: {len(reader.pages)}")
        for page in reader.pages:
            # Extract once per page; fall back to an empty string when the
            # page yields no text.
            text = page.extract_text() or ""
            # Strip a page number (digits followed by whitespace) from the
            # very start of the page text.
            cleaned_text = re.sub(r'^\d+\s+', '', text)
            all_data.update(extract_key_value_pairs(cleaned_text))

    # Save the merged result as a JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)
    print(f"Data extraction complete and saved to '{output_json_path}'.")
if __name__ == "__main__":
    # Example invocation: parse a sample PDF and write the extracted
    # selections to a JSON file in the current working directory.
    read_pdf_and_judge_main(
        file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_tobidders_notice_table.pdf',
        output_json_path='judge_exist.json',
    )