2024-08-29 16:37:09 +08:00
|
|
|
|
import re
|
|
|
|
|
import PyPDF2
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def extract_key_value_pairs(text):
    """Extract checkbox-style key/value pairs from one page of text.

    Three passes fill a single dict; later passes may overwrite or extend
    what earlier passes stored:

    1. Inline entries: a serial number such as ``1.2``, a key, one marker
       character (``\\x01``, ``\\x02``, ``☑`` or ``团``) and then the value.
    2. A serial-numbered key followed by ``/`` at the end of the text,
       which is stored with the value ``"无"``.
    3. A line-by-line scan: a line starting with a marker supplies the
       value for the most recent line that contained a serial number, and
       any other line ending in ``/`` is stored with the value ``"无"``.

    Args:
        text: Text to scan (lines separated by ``\\n``).

    Returns:
        dict mapping cleaned key strings to value strings.
    """
    serial_prefix = re.compile(r'^\d+(\.\d+)*\s+')
    any_whitespace = re.compile(r'\s+')
    value_terminator = re.compile(r'[,。、,]')

    def _normalise_key(raw):
        # Drop a leading serial number, then every whitespace run and
        # every ASCII parenthesis.
        without_serial = serial_prefix.sub('', raw)
        compact = any_whitespace.sub('', without_serial)
        return compact.replace('(', '').replace(')', '')

    extracted = {}

    # Pass 1: "<serial> <key><marker><value>". "团" is accepted as a marker
    # (presumably a mis-extracted check mark - confirm against real PDFs).
    inline_entry = re.compile(
        r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
    )
    for raw_key, raw_value in inline_entry.findall(text):
        # Only the part of the value before the first separator is kept.
        extracted[_normalise_key(raw_key)] = value_terminator.split(raw_value)[0].strip()

    # Pass 2: serial-numbered key followed by a trailing slash.
    # NOTE(review): no re.MULTILINE is used, so "$" only matches at the end
    # of the whole text (or just before a final newline); slashes on earlier
    # lines are only picked up by the line scan below.
    trailing_slash_entry = re.compile(r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)\/$')
    for raw_key in trailing_slash_entry.findall(text):
        extracted[_normalise_key(raw_key)] = "无"

    # Pass 3: markers at the start of a line, and per-line trailing slashes.
    leading_marker = re.compile(r'^[\x01\x02☑√团]')
    leading_marker_and_gap = re.compile(r'^[\x01\x02☑√团]\s*')
    key_cutoff = re.compile(r'[\x01\x02☑□团]')
    serial_anywhere = re.compile(r'\d+\.\d+')
    slash_line = re.compile(r'^((?:\d+(?:\.\d+)*\.?\s*)?)(.+?)\s*/\s*$')

    pending_heading = ""  # most recent non-marker line containing a serial number
    for raw_line in text.split('\n'):
        if leading_marker.match(raw_line):
            if not pending_heading:
                # No serial-numbered line seen yet: nothing to attach to.
                continue
            remainder = leading_marker_and_gap.sub('', raw_line)
            picked_value = value_terminator.split(remainder)[0].strip()

            base_key = serial_prefix.sub('', pending_heading)
            base_key = key_cutoff.split(base_key)[0].strip()

            # Avoid overwriting an existing entry: append one more trailing
            # space per attempt until the key is unused.
            candidate = base_key
            padding = 0
            while candidate in extracted:
                padding += 1
                candidate = f"{base_key}{' ' * padding}"
            extracted[candidate] = picked_value
            continue

        stripped = raw_line.strip()
        if serial_anywhere.search(stripped):
            pending_heading = stripped
        if stripped.endswith('/'):
            hit = slash_line.match(stripped)
            if hit:
                # Group 2 is the key text after the optional serial number;
                # only plain spaces are removed here (parentheses are kept).
                slash_key = hit.group(2).strip().replace(' ', '')
                extracted[slash_key] = "无"

    return extracted
|
|
|
|
|
|
|
|
|
|
def read_pdf_and_judge_main(file_path, output_json_path):
    """Extract key/value pairs from every page of a PDF and save them as JSON.

    Each page's text is extracted once, a leading "digits + whitespace"
    prefix is removed, the cleaned text is printed, and the pairs returned
    by ``extract_key_value_pairs`` are merged into one dict. Merging uses
    ``dict.update``, so a key found on a later page replaces the same key
    from an earlier page.

    Args:
        file_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to write (UTF-8, non-ASCII
            characters kept as-is, 4-space indent).

    Returns:
        None. Results are written to ``output_json_path``; progress is
        printed to stdout.
    """
    all_data = {}

    # The reader pulls page content from the open stream, so all page work
    # stays inside the ``with`` block.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        print(f"Total pages: {len(reader.pages)}")

        for page in reader.pages:
            # Extract once per page; the previous code called extract_text()
            # twice (once for the truthiness test, once for the value).
            text = page.extract_text() or ""

            # Remove a leading run of digits followed by whitespace at the
            # very start of the page text (per the original comment this is
            # the page number).
            cleaned_text = re.sub(r'^\d+\s+', '', text)
            print(cleaned_text)

            all_data.update(extract_key_value_pairs(cleaned_text))

    # TODO(review): the original author left an open question here asking
    # whether two things differ; the compared items are not legible in the
    # source (presumably two invisible marker characters) - confirm.

    # Save everything that was collected as a JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)

    print(f"Data extraction complete and saved to '{output_json_path}'.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Example invocation with a hard-coded sample PDF.
    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\a95a9a0a-f2dd-4007-b849-6b0a1a4c2b91\\ztbfile_tobidders_notice_table.pdf"
    # Alternative sample input kept from the original script:
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
    output_json_path = 'judge_exist.json'

    read_pdf_and_judge_main(
        file_path=file_path,
        output_json_path=output_json_path,
    )
|