84 lines
3.3 KiB
Python
84 lines
3.3 KiB
Python
import re
|
||
import PyPDF2
|
||
import json
|
||
|
||
def extract_key_value_pairs(text):
    """Extract key/value pairs from checkbox-style form text.

    Two passes are made over ``text``:

    1. Inline pairs: a serial number such as ``1.2``, then the key, then a
       marker character (U+0001, U+0002 or ``☑``), then the value.  The key
       has all whitespace and ASCII parentheses removed.  A later pair with
       the same cleaned key overwrites the earlier one.
    2. Marker-led lines: a line that starts with U+0001, U+0002, ``☑`` or
       ``√`` supplies a value for the most recent earlier line that
       contained a serial number.  If the resulting key is already present,
       trailing spaces are appended until it is unique, so no entry is
       overwritten in this pass.

    Values are cut at the first ASCII or full-width comma, ``。`` or ``、``
    and stripped of surrounding whitespace.

    Args:
        text: The text to scan (one page of extracted PDF text in this file).

    Returns:
        A dict mapping key strings to value strings; empty when nothing
        matches.
    """
    # NOTE(review): both character classes in the inline pattern contain
    # ``\s``, so a key or value can run across a newline and swallow the
    # leading digits of the next serial number -- confirm this is intended.
    inline_pair = re.compile(
        r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑]([\w\s\u4e00-\u9fff]+)'
    )
    serial_prefix = re.compile(r'^\d+(\.\d+)*\s+')
    serial_anywhere = re.compile(r'\d+\.\d+')
    whitespace = re.compile(r'\s+')
    value_end = re.compile(r'[,。、,]')
    leading_marker = re.compile(r'^[\x01\x02☑√]\s*')
    key_end = re.compile(r'[\x01\x02□]')

    results = {}

    # Pass 1: inline "<serial> <key><marker><value>" pairs.
    for raw_key, raw_value in inline_pair.findall(text):
        # Drop any serial number that is still at the front of the key.
        key = serial_prefix.sub('', raw_key)
        # Remove all whitespace and ASCII parentheses from the key.
        cleaned_key = whitespace.sub('', key).replace('(', '').replace(')', '')
        # Keep the value only up to the first terminating punctuation mark.
        results[cleaned_key] = value_end.split(raw_value, maxsplit=1)[0].strip()

    # Pass 2: lines that begin with a marker take their key from the most
    # recent line that contained a serial number.
    last_serial_line = ""
    for line in text.split('\n'):
        marker = leading_marker.match(line)
        if marker is None:
            if serial_anywhere.search(line):
                last_serial_line = line
            continue
        if not last_serial_line:
            # No candidate key line has been seen yet; nothing to attach to.
            continue

        value = value_end.split(line[marker.end():], maxsplit=1)[0].strip()

        # Derive the key from the serial line, stopping at the first
        # U+0001, U+0002 or '□' character.
        key = serial_prefix.sub('', last_serial_line)
        key = key_end.split(key, maxsplit=1)[0].strip()

        # Disambiguate duplicates by padding with trailing spaces, so that
        # several marker lines under one serial line are all retained.
        base_key = key
        padding = 1
        while key in results:
            key = f"{base_key}{' ' * padding}"
            padding += 1
        results[key] = value

    return results
|
||
|
||
|
||
|
||
def read_pdf_and_judge_main(file_path, output_json_path):
    """Extract key/value pairs from every page of a PDF and save them as JSON.

    Each page's text is passed to ``extract_key_value_pairs`` and the
    per-page results are merged into one dict; when the same key appears on
    more than one page, the later page's value wins.  The merged dict is
    written to ``output_json_path`` as UTF-8 JSON with non-ASCII characters
    left unescaped and a 4-space indent.  The page count and a completion
    message are printed to stdout.

    Args:
        file_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to create or overwrite.
    """
    all_data = {}

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f"Total pages: {len(reader.pages)}")

        for page in reader.pages:
            # Extract once per page; a falsy result (None or empty) is
            # treated as an empty page.
            text = page.extract_text() or ""
            # Remove a leading page number (digits followed by whitespace)
            # from the very start of the page text.
            cleaned_text = re.sub(r'^\d+\s+', '', text)
            all_data.update(extract_key_value_pairs(cleaned_text))
            # TODO(review): a note here asked "is there a difference between
            # ... and ...?" with both operands lost -- presumably the two
            # control-character markers; confirm.

    # Save all collected data as a JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)

    print(f"Data extraction complete and saved to '{output_json_path}'.")
|
||
|
||
|
||
if __name__ == "__main__":
    # Example invocation: read the sample PDF and write the extracted
    # key/value pairs to a JSON file in the current working directory.
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_tobidders_notice_table.pdf'
    output_json_path = 'judge_exist.json'
    read_pdf_and_judge_main(
        file_path=file_path,
        output_json_path=output_json_path,
    )
|