# zbparse/flask_app/main/提取打勾符号.py

import re
import PyPDF2
import json

def extract_key_value_pairs(text):
    """Collect ``{item name: selected option}`` pairs from extracted page text.

    Three passes run over ``text`` and all write into one dict (later writes
    overwrite earlier ones for the same key):

    1. Inline marks: ``<n.n> <name> <mark> <option>`` where the mark is one of
       U+0001, U+0002, ``☑`` or ``团``.
    2. A numbered item followed by ``/`` at the very end of ``text``; the item
       is stored with the value ``"无"``.
    3. A line-by-line scan that (a) attaches a line *starting* with a mark to
       the most recent line containing a serial number, and (b) stores
       ``"无"`` for any line ending in ``/``.

    Args:
        text: Text of one page (or any multi-line string).

    Returns:
        dict mapping cleaned item names to the text found after the mark,
        truncated at the first ``，。、,``.
    """
    serial_prefix_re = re.compile(r'^\d+(\.\d+)*\s+')
    whitespace_re = re.compile(r'\s+')
    clause_break_re = re.compile(r'[，。、,]')
    leading_mark_re = re.compile(r'^[\x01\x02☑√团]')
    leading_mark_gap_re = re.compile(r'^[\x01\x02☑√团]\s*')
    box_symbol_re = re.compile(r'[\x01\x02☑□团]')
    serial_number_re = re.compile(r'\d+\.\d+')
    inline_mark_re = re.compile(
        r'\d+\.\d+\s*([\w\s（）\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
    )
    # No re.MULTILINE here, so `$` anchors to the end of the whole text only.
    trailing_slash_re = re.compile(r'\d+\.\d+\s*([\w\s（）\u4e00-\u9fff]+)\/$')
    slash_line_re = re.compile(r'^((?:\d+(?:\.\d+)*\.?\s*)?)(.+?)\s*/\s*$')

    def squeeze(name):
        # Drop every whitespace run and both full-width parentheses.
        return whitespace_re.sub('', name).replace('（', '').replace('）', '')

    results = {}

    # Pass 1: the mark sits on the same logical run as the numbered item.
    for raw_name, raw_option in inline_mark_re.findall(text):
        name = serial_prefix_re.sub('', raw_name)
        results[squeeze(name)] = clause_break_re.split(raw_option)[0].strip()

    # Pass 2: numbered item terminated by a slash at the end of the text.
    for raw_name in trailing_slash_re.findall(text):
        name = serial_prefix_re.sub('', raw_name).strip()
        results[squeeze(name)] = "无"  # a trailing slash means "none"

    # Pass 3: line-oriented scan for marks at line start and slash-terminated lines.
    latest_numbered_line = ""
    for raw_line in text.split('\n'):
        if leading_mark_re.match(raw_line):
            if not latest_numbered_line:
                # Nothing to attach this option to yet.
                continue
            option_text = leading_mark_gap_re.sub('', raw_line)
            option = clause_break_re.split(option_text)[0].strip()
            heading = serial_prefix_re.sub('', latest_numbered_line)
            base_name = box_symbol_re.split(heading)[0].strip()
            # Keep earlier entries: pad the name with spaces until it is unused.
            unique_name = base_name
            padding = 1
            while unique_name in results:
                unique_name = base_name + ' ' * padding
                padding += 1
            results[unique_name] = option
            continue

        trimmed = raw_line.strip()
        if serial_number_re.search(trimmed):
            latest_numbered_line = trimmed  # remember the last line holding a serial number
        if not trimmed.endswith('/'):
            continue
        slash_hit = slash_line_re.match(trimmed)
        if slash_hit:
            # Group 2 is the item name without the optional leading serial number.
            results[slash_hit.group(2).strip().replace(' ', '')] = "无"
    return results


def read_pdf_and_judge_main(file_path, output_txt_path):
    """Extract checked-option pairs from every page of a PDF and write them to a text file.

    Each page's text is stripped of page-number artefacts, passed to
    ``extract_key_value_pairs``, and the per-page dicts are merged (a later
    page overwrites an earlier one for the same key). The merged result is
    written as one ``"key": "value",`` line per entry.

    Args:
        file_path: Path of the PDF to read.
        output_txt_path: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. The result is the file written at ``output_txt_path``.
    """
    all_data = {}
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page; fall back to "" when nothing is returned.
            text = page.extract_text() or ""
            # Drop a leading page number, only when a non-digit follows it.
            text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
            # Drop a trailing page number.
            text = re.sub(r'\s+\d+\s*$', '', text)
            # Drop page markers of the form "/129".
            text = re.sub(r'\s*\/\s*\d+\s*', '', text)
            all_data.update(extract_key_value_pairs(text))

    # Persist every pair as a `"key": "value",` line.
    with open(output_txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.writelines(
            f'"{key}": "{value}",\n' for key, value in all_data.items()
        )

    print(f"da_gou signal: Data extraction complete and saved to '{output_txt_path}'.")


if __name__ == "__main__":
    # Example invocation. Raw strings keep the Windows backslashes literal;
    # the previous literals relied on the invalid escape "\A", which newer
    # Python versions warn about.
    file_path = r"C:\Users\Administrator\Desktop\fsdownload\ee2d8828-bae0-465a-9171-7b2dd7453251\ztbfile_tobidders_notice_table.pdf"
    # Alternative sample input:
    # file_path = r"C:\Users\Administrator\Desktop\货物标\output\ztb_20240903100337\ztb_20240903100337_14-20.pdf"
    output_txt_path = r'C:\Users\Administrator\Desktop\fsdownload\ee2d8828-bae0-465a-9171-7b2dd7453251\tmp\judge_exist.txt'
    # NOTE: read_pdf_and_judge_main has no return statement, so output_path is None.
    output_path = read_pdf_and_judge_main(file_path, output_txt_path)