"""Extract ticked-checkbox answers from a PDF table and save them as JSON.

Each numbered item ("1.2 <key>") in the extracted page text is paired with
the text that follows a tick mark, or with "无" when the item ends in "/".
"""

import json
import logging
import re
from typing import Dict

import PyPDF2
from flask import g, has_app_context

# "<n.n> <key><mark><value>" anywhere in the text.  The mark class holds the
# characters the extracted text uses for a ticked box (\x01, \x02, "☑", and
# "团" -- presumably a mis-extracted tick glyph; confirm against real PDFs).
_INLINE_PAIR_RE = re.compile(
    r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
)

# "<n.n> <key>/" at the end of the text.
# NOTE(review): there is no re.MULTILINE here, so "$" only matches at the very
# end of the text (or right before a final newline).  Slash-terminated lines
# elsewhere are handled by the line-by-line pass (_SLASH_LINE_RE).  Adding the
# flag would emit additional, differently-normalised keys, so it is left as-is.
_TEXT_END_SLASH_RE = re.compile(r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)\/$')

# Leading serial number such as "3 ", "3.1 " or "3.1.2 ".
_SERIAL_PREFIX_RE = re.compile(r'^\d+(\.\d+)*\s+')
# Any "n.n" serial number inside a line.
_SERIAL_RE = re.compile(r'\d+\.\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# A value ends at the first comma / full stop / enumeration comma.  The
# full-width comma (U+FF0C) is included so that the line-start pass truncates
# values at the same place the inline pass does (its value class cannot
# contain that character in the first place).
_VALUE_END_RE = re.compile(r'[,\uff0c。、]')

# A tick mark at the very beginning of a line, plus following whitespace.
_LEADING_MARK_RE = re.compile(r'^[\x01\x02☑√团]\s*')
# Characters at which the key part of a remembered serial line is cut off.
_KEY_MARK_SPLIT_RE = re.compile(r'[\x01\x02☑□团]')
# "[serial] <key> /" occupying a whole (stripped) line.
_SLASH_LINE_RE = re.compile(r'^((?:\d+(?:\.\d+)*\.?\s*)?)(.+?)\s*/\s*$')
# Page number ("digits + whitespace") at the very start of a page's text.
_PAGE_NUMBER_RE = re.compile(r'^\d+\s+')

# Value stored for items whose line ends with "/".
_NONE_VALUE = "无"


def _normalize_key(raw_key: str) -> str:
    """Drop a leading serial number, all whitespace and ASCII parentheses."""
    without_serial = _SERIAL_PREFIX_RE.sub('', raw_key)
    compact = _WHITESPACE_RE.sub('', without_serial)
    return compact.replace('(', '').replace(')', '')


def _first_clause(value: str) -> str:
    """Return *value* up to its first terminator character, stripped."""
    return _VALUE_END_RE.split(value, maxsplit=1)[0].strip()


def _get_logger():
    """Return ``g.logger`` inside a Flask app context, else a module logger.

    Accessing ``flask.g`` without an application context raises
    ``RuntimeError``, which is what happens when this module is run as a
    script, so fall back to the standard ``logging`` machinery in that case.
    """
    if has_app_context():
        app_logger = getattr(g, "logger", None)
        if app_logger is not None:
            return app_logger
    return logging.getLogger(__name__)


def extract_key_value_pairs(text: str) -> Dict[str, str]:
    """Collect ``{item name: selected value}`` pairs from one page of text.

    Three passes are applied, later ones adding to (or overwriting) earlier
    results:

    1. Inline pairs ``<n.n> <key><tick><value>`` found anywhere in *text*.
    2. A numbered item ending in "/" at the end of *text* gets the value "无".
    3. Line by line: a line starting with a tick mark supplies the value for
       the most recent line that contained a serial number; a line ending in
       "/" gets the value "无".

    Args:
        text: Text extracted from a single PDF page.

    Returns:
        Mapping of cleaned item names to their values.  When a tick-mark line
        would reuse an existing key, trailing spaces are appended to the key
        until it is unique.
    """
    results: Dict[str, str] = {}

    # Pass 1: key and value on either side of an inline tick mark.
    for raw_key, raw_value in _INLINE_PAIR_RE.findall(text):
        results[_normalize_key(raw_key)] = _first_clause(raw_value)

    # Pass 2: numbered item terminated by "/" at the end of the text.
    for raw_key in _TEXT_END_SLASH_RE.findall(text):
        results[_normalize_key(raw_key)] = _NONE_VALUE

    # Pass 3: tick marks at the start of a line, and per-line trailing "/".
    last_serial_line = ""
    for line in text.split('\n'):
        mark = _LEADING_MARK_RE.match(line)
        if mark:
            value = _first_clause(line[mark.end():])
            if not last_serial_line:
                # No numbered item seen yet: nothing to attach the value to.
                continue
            key = _SERIAL_PREFIX_RE.sub('', last_serial_line)
            key = _KEY_MARK_SPLIT_RE.split(key, maxsplit=1)[0].strip()
            # Never overwrite an existing entry: pad the key with one more
            # trailing space per attempt until it is unused.
            base_key = key
            padding = 1
            while key in results:
                key = f"{base_key}{' ' * padding}"
                padding += 1
            results[key] = value
            continue

        stripped = line.strip()
        if _SERIAL_RE.search(stripped):
            # Remember the latest line that carries a serial number.
            last_serial_line = stripped
        if stripped.endswith('/'):
            slash_line = _SLASH_LINE_RE.match(stripped)
            if slash_line:
                key = slash_line.group(2).strip().replace(' ', '')
                results[key] = _NONE_VALUE

    return results


def read_pdf_and_judge_main(file_path, output_json_path) -> None:
    """Extract key/value pairs from every page of a PDF and dump them as JSON.

    Args:
        file_path: Path of the PDF file to read.
        output_json_path: Path of the JSON file to write (UTF-8, indent 4).
            Pairs from later pages overwrite equal keys from earlier pages.
    """
    all_data: Dict[str, str] = {}

    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # extract_text() can be expensive; call it once per page.
            text = page.extract_text() or ""
            # Remove the page number ("digits + whitespace") that opens a page.
            cleaned_text = _PAGE_NUMBER_RE.sub('', text)
            all_data.update(extract_key_value_pairs(cleaned_text))

    # Save everything that was collected as a JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)

    _get_logger().info(
        f"da_gou signal: Data extraction complete and saved to '{output_json_path}'."
    )


if __name__ == "__main__":
    # Example invocation.  Without a Flask app the fallback logger is used,
    # so configure logging to make the completion message visible.
    logging.basicConfig(level=logging.INFO)
    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\a95a9a0a-f2dd-4007-b849-6b0a1a4c2b91\\ztbfile_tobidders_notice_table.pdf"
    output_json_path = 'judge_exist.json'
    read_pdf_and_judge_main(file_path, output_json_path)