# zbparse/flask_app/general/投标人须知正文条款提取成json文件.py

import re

from PyPDF2 import PdfReader

from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
def compare_headings(current, new):
    """Tell whether heading number ``new`` comes after heading number ``current``.

    Both arguments are dot-separated heading numbers such as ``'3.2'`` or
    ``'12.'``. Segments that are not purely digits (including the empty
    segment left by a trailing dot) are ignored.

    Returns:
        True when the first differing level of ``new`` is larger, or when all
        shared levels are equal and ``new`` has more levels; False otherwise.
    """
    def _levels(heading):
        # Keep only the numeric segments, converted to integers.
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    old_levels = _levels(current)
    fresh_levels = _levels(new)

    # The first level that differs decides the ordering.
    for old_num, fresh_num in zip(old_levels, fresh_levels):
        if fresh_num != old_num:
            return fresh_num > old_num

    # All shared levels are equal: a deeper number counts as a new sub-heading.
    return len(fresh_levels) > len(old_levels)


def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the text collected so far.

    Args:
        content: Iterable of string pieces; they are joined and stripped.
        keywords: Strings whose presence in the joined text forces a break.
        max_length: Joined text of at most this many characters also gets a
            break.

    Returns:
        True if any keyword occurs in the joined text or the text is no
        longer than ``max_length``; False otherwise.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length

def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
    """Append one line to the content buffer, adding line breaks where needed.

    Args:
        current_content: List of string pieces; mutated in place.
        line_content: The line to append.
        append_newline: When truthy, a ``'\\n'`` is inserted before the line
            if ``should_add_newline`` approves of the current buffer.
        keywords: Forwarded to ``should_add_newline``.
        in_special_section: When truthy, a ``'\\n'`` is appended after the line.

    Returns:
        ``False`` when ``append_newline`` was truthy (the pending break is
        consumed either way); otherwise ``append_newline`` unchanged.
    """
    # A pending break is only materialised when the buffer qualifies for one.
    if append_newline and should_add_newline(current_content, keywords):
        current_content.append('\n')

    current_content.append(line_content)

    if in_special_section:
        current_content.append('\n')

    return False if append_newline else append_newline


import re

def parse_text_by_heading(text):
    """Split clause text into a ``{heading key: content}`` dictionary.

    The text is processed line by line (full-width dots are normalised to
    ``.`` first). A line opens a new entry when it starts with one of the
    recognised heading forms:

    * ``12.1 text`` / ``12. text`` - keyed by the number as written, provided
      it sorts after the open key according to ``compare_headings``;
    * ``.1 text`` - keyed as ``<last main number>.1``;
    * ``27text`` - keyed as ``27.``; accepted when no entry is open or when
      the number exceeds the open key's main number by at most 5;
    * ``一、`` / ``（一）`` / ``A.`` / ``1、`` - only considered when one of the
      first five lines looks like such a heading, and then only for the first
      of these styles encountered. Letter headings are keyed by the matching
      Chinese numeral.

    A line of the form ``.Title`` (dot followed by a non-digit) is remembered
    and stored under ``"<main number>."`` of the next numeric heading. Any
    other non-heading line is appended to the entry that is currently open.

    Args:
        text: The text to parse, with lines separated by newline characters.

    Returns:
        dict mapping heading keys to their text. A remembered ``.Title`` that
        is never followed by a numeric heading is stored under
        ``"未分类标题"``.
    """
    keywords = ['包含', '以下']
    # A heading containing one of these opens a "special section" in which
    # every appended line is followed by a line break and Chinese-style
    # headings are not recognised.
    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括：','雷同认定','包括以下内容']
    letter_to_chinese = {
        'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
        'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
        'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
    }

    # Chinese-style / letter / "1、" heading patterns.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')   # 一、
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*') # （一）
    pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$')  # pre-scan only: 'A text' or 'A. text'
    pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$')  # main loop: strictly 'A. text'
    pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')    # 1、text

    # Numeric heading patterns, compiled once instead of once per line.
    pattern_multi_level = re.compile(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)')  # '12.1 text'
    pattern_single_level = re.compile(r'^(\d+\.)\s*(.+)$')                        # '12. text'
    pattern_dot_number = re.compile(r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$')           # '.12.1 text'
    pattern_dot_text = re.compile(r'^[．.]\s*(\D.+)$')                             # '.Title'
    pattern_pure_number = re.compile(r'^(\d+)([^.\d)(）].*)')                      # '27text', no bracket after the number
    pattern_year = re.compile(r'^\d{4}\s*年')                                      # '2014年'
    heading_patterns = (
        pattern_multi_level,
        pattern_single_level,
        pattern_dot_number,
        re.compile(r'^(\d+)([^.\d].*)'),                                          # '27 text'
    )

    data = {}
    current_key = None            # key of the open numeric entry
    current_key_chinese = None    # key of a pending Chinese-style entry
    current_value_chinese = None
    current_content = []          # text pieces buffered for current_key
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    temp_title = None             # '.Title' text waiting for its heading number
    initial_heading_pattern = None
    in_special_section = False
    lines = text.split('\n')

    def check_year_pattern(line):
        # True for lines that start with a year such as '2014年'.
        return pattern_year.match(line.replace(' ', '')) is not None

    def is_heading(line):
        # True when the line matches any numeric heading pattern.
        return any(pattern.match(line) for pattern in heading_patterns)

    def extract_number(key):
        # Main (first) number of a key such as '12.' or '12.3'.
        return float(key.rstrip('.').split('.')[0])

    def flush_current():
        # Append the buffered text to the open entry.
        data[current_key] = data.get(current_key, '') + ''.join(current_content).strip()

    def ends_with_ordinal_prefix():
        # True when the last buffered piece ends with '第'; presumably the
        # number on the following line then continues a "第N..." reference
        # and is not a heading. endswith() also copes with an empty piece,
        # where indexing [-1] used to raise IndexError.
        return bool(current_content) and current_content[-1].endswith('第')

    def looks_like_initial_heading(raw_line):
        candidate = raw_line.strip().replace('．', '.')
        return bool(pattern_numbered.match(candidate) or
                    pattern_parentheses.match(candidate) or
                    pattern_letter_initial.match(candidate) or
                    pattern_arabic.match(candidate))

    # Chinese-style headings are only considered when one of the first five
    # lines looks like one.
    has_initial_heading_patterns = any(looks_like_initial_heading(raw) for raw in lines[:5])

    for line in lines:
        line_stripped = line.strip().replace('．', '.')

        # Inside an entry, a line starting with a year is always content.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue

        # Numeric heading such as '12.1 text', falling back to '12. text'.
        match = pattern_multi_level.match(line_stripped) or pattern_single_level.match(line_stripped)

        # Every heading line decides anew whether a special section is active.
        if is_heading(line_stripped):
            in_special_section = any(keyword in line_stripped for keyword in special_section_keywords)

        dot_match = pattern_dot_number.match(line_stripped)
        dot_text_match = pattern_dot_text.match(line_stripped)
        pure_number_match = pattern_pure_number.match(line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.．、,').replace(' ', '')

            # A heading announcing an enumeration suppresses Chinese-style
            # heading detection for the lines that follow it.
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            # Store the pending Chinese-style entry.
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None

            # Attach a pending '.Title' to this heading's main number.
            if temp_title:
                main_number = new_key.split('.')[0]
                data[f"{main_number}."] = temp_title
                temp_title = None

            if current_key is None or (compare_headings(current_key, new_key)
                                       and not ends_with_ordinal_prefix()):
                if current_key is not None:
                    flush_current()
                current_key = new_key
                current_content = [line_content]
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_match:
            # '.1 text': a sub-number of the last main heading.
            sub_number, line_content = dot_match.groups()
            sub_number = sub_number.replace('．', '.')
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    flush_current()
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_text_match:
            # '.Title': remember it until the next numeric heading arrives.
            temp_title = dot_text_match.group(1).strip()
            continue

        elif pure_number_match:
            # '27text': a number without a dot.
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'
            line_content = line_content.lstrip('.．、,').replace(' ', '')

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)

            # Accept only a number that increases by at most 5; anything else
            # is treated as content that merely starts with a number.
            if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
                if temp_title:
                    main_number = new_key_candidate.split('.')[0]
                    data[f"{main_number}."] = temp_title
                    temp_title = None

                if current_key is not None:
                    flush_current()
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        else:
            heading_style = None
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                numbered_match = pattern_numbered.match(line_stripped)
                parentheses_match = pattern_parentheses.match(line_stripped)
                letter_match = pattern_letter.match(line_stripped)
                arabic_match = pattern_arabic.match(line_stripped)

                if numbered_match:
                    heading_style = 'numbered'
                elif parentheses_match:
                    heading_style = 'parentheses'
                elif letter_match:
                    heading_style = 'letter'
                elif arabic_match:
                    # NOTE(review): a line matching pattern_arabic also matches
                    # pattern_pure_number and is handled above, so this style
                    # looks unreachable here - confirm before relying on it.
                    heading_style = 'arabic'

                # The first style encountered becomes the only accepted one.
                if heading_style is not None and initial_heading_pattern is None:
                    initial_heading_pattern = heading_style

            if heading_style is not None and heading_style == initial_heading_pattern:
                if current_key is not None:
                    flush_current()
                    # Empty the buffer after flushing; the key stays open, and
                    # keeping the text would append it a second time on the
                    # next flush.
                    current_content = []
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                    current_key_chinese = None

                if heading_style == 'numbered':
                    current_key_chinese = numbered_match.group(1)
                    current_value_chinese = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
                elif heading_style == 'parentheses':
                    current_key_chinese = parentheses_match.group(1)
                    current_value_chinese = line_stripped[parentheses_match.end():].lstrip('.．、,').replace(' ', '')
                elif heading_style == 'letter':
                    letter_key, letter_value = letter_match.groups()
                    current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                    current_value_chinese = letter_value.lstrip('.．、,').replace(' ', '')
                else:  # 'arabic'
                    arabic_key, arabic_value = arabic_match.groups()
                    current_key = arabic_key.replace('、', '.')
                    current_content = [arabic_value.replace(' ', '')]
                    append_newline = True
                    last_main_number = current_key.rstrip('.')
            elif line_stripped:
                # Not an accepted heading: non-empty lines are plain content.
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

    # Store whatever is still open at the end of the text.
    if current_key is not None:
        flush_current()
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
        # A '.Title' that never received a heading number.
        data["未分类标题"] = temp_title

    return data


def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Return the cleaned text of a PDF, trimmed at a start and an end marker.

    Each page's text is extracted and passed through ``clean_page_content``
    together with the header returned by ``extract_common_header``. On the
    first page everything before the first match of ``start_word`` is
    dropped; on the last page everything from the last match of
    ``end_pattern`` onwards is dropped. A page whose marker is not found is
    kept whole.

    Args:
        file_path: Path of the PDF; passed to ``extract_common_header`` and
            ``PdfReader``.
        start_word: Regular-expression pattern searched (``re.MULTILINE``) in
            the first page.
        end_pattern: Regular-expression pattern searched (``re.MULTILINE``) in
            the last page.

    Returns:
        str: The page texts joined with newline characters.
    """
    common_header = extract_common_header(file_path)
    pages = PdfReader(file_path).pages
    last_page_index = len(pages) - 1
    all_pages_text = []

    for i, page in enumerate(pages):
        # Extract once per page instead of calling extract_text() twice.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # First page: keep the text from the start marker onwards.
        if i == 0:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                cleaned_text = cleaned_text[start_match.start():]

        # Last page: cut at the last occurrence of the end marker.
        if i == last_page_index:
            matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]

        all_pages_text.append(cleaned_text)

    return "\n".join(all_pages_text)