zbparse/flask_app/main/提取打勾符号.py

86 lines
3.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import PyPDF2
import json
# Pre-compiled patterns used by extract_key_value_pairs.
#
# "Tick" characters: \x01 / \x02 and "团" are presumably what the PDF text
# extractor emits for a ticked checkbox glyph (TODO confirm against real
# input); "☑" and "√" are literal tick marks, "□" is an empty checkbox.
_SERIAL_PREFIX_RE = re.compile(r'^\d+(\.\d+)*\s+')
_WHITESPACE_RE = re.compile(r'\s+')
# A value ends at the first ASCII comma, ideographic full stop, enumeration
# comma or full-width comma (U+FF0C).  The full-width comma is written as an
# escape so it cannot be silently confused with the ASCII one.
_CLAUSE_END_RE = re.compile(r'[,。、\uff0c]')
_INLINE_TICK_RE = re.compile(
    r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
)
# No re.MULTILINE: "$" only anchors at the very end of the text.
_TEXT_END_SLASH_RE = re.compile(r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)\/$')
_LEADING_TICK_RE = re.compile(r'^[\x01\x02☑√团]\s*')
_KEY_TICK_SPLIT_RE = re.compile(r'[\x01\x02☑□团]')
_HAS_SERIAL_RE = re.compile(r'\d+\.\d+')
_LINE_END_SLASH_RE = re.compile(r'^((?:\d+(?:\.\d+)*\.?\s*)?)(.+?)\s*/\s*$')


def _first_clause(value):
    """Return *value* up to its first clause delimiter, stripped of blanks."""
    return _CLAUSE_END_RE.split(value)[0].strip()


def extract_key_value_pairs(text: str) -> dict:
    """Extract ``{item: ticked option}`` pairs from a chunk of extracted text.

    Three passes write into the same result dict:

    1. Inline ticks: ``<serial> <key><tick><value>`` anywhere in the text.
       The key has all whitespace removed.
    2. Text-end slash: if the text ends with ``<serial> <key>/`` the key is
       stored with an empty-string value.
    3. Line scan: a line starting with a tick mark is treated as the value
       of the most recent line containing a serial number such as ``1.2``;
       a non-tick line ending in ``/`` stores its key with an empty string.

    Args:
        text: Plain text, lines separated by ``\\n``.

    Returns:
        Dict mapping key text to value text.  In pass 3, when a ticked line
        would reuse an existing key, trailing spaces are appended to the key
        until it is unique, so several ticked options under one item are all
        kept.
    """
    results = {}

    # Pass 1: "<serial> <key><tick><value>".
    for raw_key, raw_value in _INLINE_TICK_RE.findall(text):
        key = _SERIAL_PREFIX_RE.sub('', raw_key)
        results[_WHITESPACE_RE.sub('', key)] = _first_clause(raw_value)

    # Pass 2: "<serial> <key>/" at the end of the text -> empty value.
    for raw_key in _TEXT_END_SLASH_RE.findall(text):
        key = _SERIAL_PREFIX_RE.sub('', raw_key).strip()
        results[_WHITESPACE_RE.sub('', key)] = ""

    # Pass 3: line-by-line scan.
    last_serial_line = ""  # most recent non-tick line that contains a serial
    for line in text.split('\n'):
        tick = _LEADING_TICK_RE.match(line)
        if tick:
            # Ticked option on its own line: attach it to the last item.
            if not last_serial_line:
                continue  # nothing to attach the value to
            value = _first_clause(line[tick.end():])
            base_key = _SERIAL_PREFIX_RE.sub('', last_serial_line)
            base_key = _KEY_TICK_SPLIT_RE.split(base_key)[0].strip()
            # Keep earlier entries: pad with spaces until the key is unused.
            key = base_key
            padding = 1
            while key in results:
                key = f"{base_key}{' ' * padding}"
                padding += 1
            results[key] = value
            continue

        line = line.strip()
        if _HAS_SERIAL_RE.search(line):
            last_serial_line = line
        if line.endswith('/'):
            slash = _LINE_END_SLASH_RE.match(line)
            if slash:
                # Key text without the optional serial prefix and spaces;
                # a trailing "/" means the item has no value.
                results[slash.group(2).strip().replace(' ', '')] = ""
    return results
def read_pdf_and_judge_main(file_path, output_json_path):
    """Extract ticked key/value pairs from every page of a PDF into a JSON file.

    Each page's text is run through ``extract_key_value_pairs`` and the
    per-page dicts are merged in page order, so a key found on a later page
    overwrites the same key from an earlier page.

    Args:
        file_path: Path of the PDF to read.
        output_json_path: Path of the JSON file to write (UTF-8, non-ASCII
            characters kept as-is, 4-space indent).

    Side effects:
        Prints the page count, each page's cleaned text and a completion
        message to stdout.
    """
    all_data = {}
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f"Total pages: {len(reader.pages)}")
        # Pages are read lazily from the open file, so the loop must stay
        # inside the ``with`` block.
        for page in reader.pages:
            # Extract once per page; an empty result is normalised to "".
            text = page.extract_text() or ""
            # Drop a leading "<digits><whitespace>" (the page number) from
            # the start of the page text.
            cleaned_text = re.sub(r'^\d+\s+', '', text)
            print(cleaned_text)
            all_data.update(extract_key_value_pairs(cleaned_text))

    # Persist the merged result.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)
    print(f"Data extraction complete and saved to '{output_json_path}'.")
if __name__ == "__main__":
    # Example invocation against a local sample file.
    # Alternative sample input used previously:
    # "C:\\Users\\Administrator\\Desktop\\货物标\\output\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\a95a9a0a-f2dd-4007-b849-6b0a1a4c2b91\\ztbfile_tobidders_notice_table.pdf"
    output_json_path = 'judge_exist.json'
    read_pdf_and_judge_main(
        file_path=file_path,
        output_json_path=output_json_path,
    )