# zbparse/flask_app/general/投标人须知正文条款提取成json文件.py

import json
import os
import re
import regex
from PyPDF2 import PdfReader

from flask_app.general.截取pdf通用函数 import clean_page_content, extract_common_header


def compare_headings(current, new, max_main_gap=3):
    """
    Decide whether ``new`` is a plausible heading that follows ``current``.

    Headings are dotted numbers such as "1.2.3" or "12.". A new heading is
    accepted only when it sorts after the current one. In addition, its
    top-level number may not jump ahead by more than ``max_main_gap``; this
    rejects body text that merely starts with a large number (for example
    "2025年xxx", where "2025" is not a clause number).

    Parameters:
    current (str): the heading currently being collected, e.g. "1.2.3"
    new (str): the candidate heading, e.g. "1.3"
    max_main_gap (int): largest allowed increase of the top-level number
        (default 3, the value this function has always used)

    Returns:
    bool: True if ``new`` comes after ``current`` and its top-level number is
    at most ``current``'s top-level number + ``max_main_gap``; otherwise False.
    """
    # Keep only purely numeric segments; this drops the empty segment left by
    # a trailing dot ("12." -> ['12', '']).
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]

    # Reject implausibly large jumps of the top-level number.
    if current_nums and new_nums and new_nums[0] > current_nums[0] + max_main_gap:
        return False

    # The first differing level decides the ordering.
    for cur, nxt in zip(current_nums, new_nums):
        if nxt != cur:
            return nxt > cur

    # Shared prefix is identical: a deeper heading counts as a new sub-clause.
    return len(new_nums) > len(current_nums)


def should_add_newline(content, keywords, max_length=20):
    """
    Tell whether a line break should follow the text collected so far.

    The pieces in ``content`` are joined and stripped. A break is wanted when
    the joined text contains any of ``keywords`` or is no longer than
    ``max_length`` characters.

    Parameters:
    content (list[str]): text fragments collected for the current clause
    keywords (list[str]): substrings that force a line break
    max_length (int): length limit under which a break is always added

    Returns:
    bool: True if a newline should be inserted, False otherwise
    """
    joined = ''.join(content).strip()
    for marker in keywords:
        if marker in joined:
            return True
    return len(joined) <= max_length


def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """
    Append one line to the clause buffer, inserting line breaks where needed.

    ``current_content`` is mutated in place. When ``append_newline`` is set
    and ``should_add_newline`` agrees, a '\n' is pushed before the line, e.g.
    ['费用承担'] + '投标人准备和参加投标活动发生的费用自理。' becomes
    ['费用承担', '\n', '投标人准备和参加投标活动发生的费用自理。'].
    Inside a special section every appended line is also followed by '\n'.

    Parameters:
    current_content (list[str]): buffer of the clause being collected
    line_content (str): the line to append
    append_newline (bool): whether a pending line break is requested
    keywords (list[str]): forwarded to ``should_add_newline``
    in_special_section (bool): whether each line gets a trailing break

    Returns:
    bool: the updated pending-break flag (False once a request was consumed;
    otherwise the incoming value unchanged)
    """
    pending_break = append_newline
    if pending_break and should_add_newline(current_content, keywords):
        current_content.append('\n')

    pieces = [line_content, '\n'] if in_special_section else [line_content]
    current_content.extend(pieces)

    # A pending request is consumed whether or not a break was written.
    return False if pending_break else pending_break


def parse_text_by_heading(text: str) -> dict:
    """
    Split clause text into a dict keyed by heading number.

    The text is processed line by line. Each line is classified as one of:
      * a dotted numeric heading ('12.1 xxx' or '12. xxx'),
      * a heading whose leading number is missing ('.12.1 xxx' or '.xxx'),
        honoured only on a line that carried the '##' first-line-of-page marker,
      * a bare-number heading ('27 xxx'),
      * a top-level heading ('一、xx', '（一）xx', 'A. xx'),
      * ordinary body text, which is appended to the clause being collected.

    Parameters:
    text (str): clause text with lines separated by '\n'; the first line of
        each page may be prefixed with '##'

    Returns:
    dict: numeric keys such as '1.1' or '3.' map to the joined clause text;
    Chinese-numeral keys such as '一' map to the top-level heading title.
    Content found again for an existing numeric key is concatenated onto it.
    """
    # A numeric heading whose text contains one of these words turns on
    # skip_subheadings; they are also passed to handle_content_append.
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    # Placeholder for a title that starts with a dot but has no number.
    # NOTE: nothing in this function ever assigns it a non-None value, so every
    # `if temp_title:` branch below is currently inert.
    temp_title = None

    # All regex patterns used for top-level headings
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')  # （一）
    pattern_letter_initial = re.compile(r'^([A-Z])[.．、]?\s*(.*)$')  # initial scan: matches 'A text' or 'A. text'
    # pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')    # 1、内容

    initial_heading_pattern = None
    # Only "file composition" headings trigger per-line breaks: their items
    # (e.g. "投标函") usually fit on one line. '下列情形' is deliberately not
    # handled because extra breaks would hurt readability there.
    special_section_keywords = [r'文件.*(组成|构成|包括|包含)']
    in_special_section = False  # whether we are currently inside a special section
    lines = text.split('\n')

    def get_current_number(key_chinese):
        """Map a Chinese numeral ('一'..'十五') to its int value; -1 if empty or unknown."""
        chinese_to_number = {              # lets Chinese ordinals be compared numerically
            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
            '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
        }
        if not key_chinese:
            return -1
        return chinese_to_number.get(key_chinese, -1)

    def check_year_pattern(line):
        """Return True if the line (spaces removed) starts with a 4-digit year, e.g. '2014年'."""
        line_no_spaces = line.replace(' ', '')
        return re.match(r'^\d{4}\s*年', line_no_spaces) is not None

    def is_heading(line):
        """Return True if the line matches any of the numeric heading shapes."""
        patterns = [
            r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)',  # matches '12.1 text'
            r'^(\d+\.)\s*(.+)$',  # matches '12. text'
            # matches '.12.1 text'. Per the original author's note, a leading dot
            # appears when the page cleaning step mistakes the clause number for
            # a page number and removes it.
            r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$',
            r'^(\d+)([^.\d].*)'  # matches '27 text'
        ]
        for pattern in patterns:
            if re.match(pattern, line):
                return True
        return False

    # Pre-scan the first five lines for a top-level heading such as '一、总则',
    # i.e. documents where big headings are interleaved with numbered clauses.
    first_five_lines = lines[:5]
    has_initial_heading_patterns = False
    for line in first_five_lines:
        line_stripped = line.strip().replace('．', '.')   # normalised copy of the line being inspected
        # '##' is added during preprocessing to the first line of every page so
        # that line can be treated specially (its number may have been removed).
        if line_stripped.startswith("##"):
            line_stripped = line_stripped[2:]       # drop the "##" marker
        if (pattern_numbered.match(line_stripped) or pattern_parentheses.match(
                line_stripped) or pattern_letter_initial.match(line_stripped)):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
        in_double_hash_mode = False    # True when this line is the first line of a page
        # Full-width dots are normalised here, so the '．' alternatives in the
        # patterns below can no longer match.
        line_stripped = line.strip().replace('．', '.')
        if line_stripped.startswith("##"):
            in_double_hash_mode = True
            line_stripped = line_stripped[2:]  # Remove "##"
        # Inside a clause, a line starting with a year is body text, not a heading.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue

        # Match a dotted numeric heading such as '12.1 text', falling back to '12. text'
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)

        # Enter or leave a special section. This guards against a heading like
        # '5、 竞争性磋商采购文件的构成' being followed by lines such as
        # '一、招标公告', '二、投标人须知': those are body text of clause 5 and
        # must not be taken for top-level headings.
        # While inside, lines are treated as content of the current clause.
        if is_heading(line_stripped):
            if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
                in_special_section = True
            elif in_special_section:
                in_special_section = False

        # Leading dot followed by digits, e.g. '.12.1 text'
        dot_match = re.match(r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$', line_stripped)

        # Leading dot without digits, e.g. '.投标文件的制作和签署'
        dot_text_match = re.match(r'^[．.、]\s*(\D.+)$', line_stripped)

        # Bare number with no dot, e.g. '27xxxxx'. The negated class excludes a
        # following dot, digit, bracket, or unit/counter word, so quantities and
        # references are not mistaken for headings.
        pure_number_match = re.match(r'^(\d+)([^.\d)(）号条款节章项例页段部步点年月日时分秒个元千万台份家].*)', line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.．、,').replace(' ', '')

            # A heading announcing a list ('包含' / '以下') disables top-level
            # heading detection until the next numeric heading.
            if any(keyword in line_content for keyword in keywords):
                skip_subheadings = True
            else:
                skip_subheadings = False

            # Store the pending top-level (Chinese-numeral) heading
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                # current_key_chinese = None

            # Attach a pending temporary title to the new main number
            # (inert at present: temp_title is never set)
            if temp_title:
                # main number = digits before the first dot
                main_number = new_key.split('.')[0]
                # file the temporary title under that main number
                data[f"{main_number}."] = temp_title
                temp_title = None  # reset

            last_char = current_content[-1][-1] if current_content and current_content[-1] else None

            # Start a new clause only if the number is a valid successor and the
            # previous text does not end with '第' (which would make this number
            # part of a wrapped reference rather than a heading).
            if current_key is None or (compare_headings(current_key, new_key) and (
                    not current_content or last_char != '第')):
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key
                current_content = [line_content]
                # Request a break after the title for headings of depth <= 2
                # and for special-section headings.
                append_newline = (
                        len(new_key.rstrip('.').split('.')) <= 2 or
                        any(re.search(pattern, line_content) for pattern in special_section_keywords)
                )
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords, in_special_section
                )

        elif dot_match:
            if in_double_hash_mode:
                # Leading dot plus digits on the first line of a page: the main
                # number is missing, so rebuild the key from last_main_number.
                sub_number, line_content = dot_match.groups()
                sub_number = sub_number.replace('．', '.')
                # enter or leave a special section
                if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
                    in_special_section = True
                elif in_special_section:
                    in_special_section = False
                if last_main_number:
                    new_key = f"{last_main_number}.{sub_number}"
                    if current_key is not None:
                        content_string = ''.join(current_content).strip()
                        data[current_key] = data.get(current_key, '') + content_string
                    current_key = new_key
                    current_content = [line_content]
                    append_newline = True
                else:
                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                           in_special_section)
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)
        elif dot_text_match:
            if in_double_hash_mode:
                # Leading dot without digits on the first line of a page:
                # synthesise the next first-level number.
                temp_content = dot_text_match.group(1).strip()
                # enter or leave a special section
                if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
                    in_special_section = True
                elif in_special_section:
                    in_special_section = False
                if last_main_number:
                    # build the next first-level number
                    try:
                        # last_main_number + 1
                        next_main_number = str(int(last_main_number) + 1) + '.'
                    except ValueError:
                        # last_main_number is expected to be Arabic digits;
                        # anything else gets a placeholder key
                        next_main_number = '未识别的序号.'
                else:
                    # no previous main number: start from '1.'
                    next_main_number = '1.'
                # flush the previous clause, then switch to the synthesised key
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string
                current_key = next_main_number
                current_content = [temp_content]  # '\n' is not added here directly
                last_main_number = next_main_number.rstrip('.')
                append_newline = True  # request '\n' before the next appended content

                continue  # skip any further handling of this line
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)
        elif pure_number_match:
            # Bare number without a dot
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'  # normalise to the 'N.' key form
            line_content = line_content.lstrip('.．、,').replace(' ', '')

            # numeric value of a key's top-level number
            def extract_number(key):
                return float(key.rstrip('.').split('.')[0])

            current_key_num = extract_number(current_key) if current_key else 0
            new_key_num = extract_number(new_key_candidate)

            # Accept as a new heading only if it increases by at most 5
            if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
                # attach a pending temporary title (inert at present)
                if temp_title:
                    main_number = new_key_candidate.split('.')[0]
                    data[f"{main_number}."] = temp_title
                    temp_title = None  # reset

                # start the new clause
                if current_key_chinese is not None:
                    data[current_key_chinese] = current_value_chinese
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                # treat the line as content of the current clause
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                       in_special_section)

        else:
            # Nothing numeric matched: the line is either a top-level heading such
            # as '一、总则' or plain body text.
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                # Top-level headings ('一、xx', '（一）xx', 'A.xx') are only
                # recognised outside special sections.
                numbered_match = pattern_numbered.match(line_stripped)  # 一、
                parentheses_match = pattern_parentheses.match(line_stripped)  # （一）
                # Within the first five lines the separator after the letter is
                # optional; later lines must have one.
                if i < 5:
                    pattern_letter = pattern_letter_initial
                else:
                    pattern_letter = re.compile(r'^([A-Z])[.．、]\s*(.*)$')
                letter_match = pattern_letter.match(line_stripped)  # A. text
                # arabic_match = pattern_arabic.match(line_stripped)          #1、内容

                # Did the line match any top-level heading style?
                if numbered_match or parentheses_match or letter_match:
                    # The first style seen becomes the document's heading style
                    if initial_heading_pattern is None:
                        if numbered_match:
                            initial_heading_pattern = 'numbered'
                        elif parentheses_match:
                            initial_heading_pattern = 'parentheses'
                        elif letter_match:
                            initial_heading_pattern = 'letter'

                    # Style of the current line
                    if numbered_match:
                        current_heading_pattern = 'numbered'
                    elif parentheses_match:
                        current_heading_pattern = 'parentheses'
                    elif letter_match:
                        current_heading_pattern = 'letter'

                    # Only headings in the document's style create a new entry
                    if current_heading_pattern == initial_heading_pattern:
                        new_key_chinese = None
                        new_value = None
                        # flush what was collected for the previous keys
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
                        if current_key is not None:
                            content_string = ''.join(current_content).strip()
                            # separate from already stored content with a newline
                            if data.get(current_key):
                                data[current_key] = data.get(current_key) + '\n' + content_string
                            else:
                                data[current_key] = content_string
                            current_content = []
                        # extract key and title from the matched heading
                        if current_heading_pattern == 'numbered':
                            new_key_chinese = numbered_match.group(1)
                            new_value = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
                            new_key_chinese = parentheses_match.group(1)
                            new_value = line_stripped[parentheses_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'letter':
                            letter_key, letter_value = letter_match.groups()
                            # letters are stored under the equivalent Chinese numeral
                            letter_to_chinese = {
                                'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                                'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                                'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                            }
                            new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                            new_value = letter_value.lstrip('.．、,').replace(' ', '')
                        # the new ordinal must be larger than the current one
                        current_number = get_current_number(current_key_chinese) if current_key_chinese else -1
                        new_number = get_current_number(new_key_chinese)
                        if new_number > current_number or current_key_chinese is None:
                            # open the new top-level heading; numeric clause tracking restarts
                            current_key_chinese = new_key_chinese
                            current_value_chinese = new_value
                            current_key = None
                        else:
                            # ordinal did not increase: treat the line as content
                            if line_stripped:
                                append_newline = handle_content_append(current_content, line_stripped, append_newline,
                                                                       keywords, in_special_section)
                    else:
                        # heading style differs from the document's style: treat as content
                        if line_stripped:
                            append_newline = handle_content_append(current_content, line_stripped, append_newline,
                                                                   keywords, in_special_section)
                else:
                    # no heading style matched: treat as content
                    if line_stripped:
                        append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                               in_special_section)
            else:
                # heading detection is off here: everything is content of the current clause
                if line_stripped:
                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
                                                           in_special_section)

    # Flush whatever is still pending for the last keys
    if current_key is not None:
        content_string = ''.join(current_content).strip()
        data[current_key] = data.get(current_key, '') + content_string
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
        # file any temporary title that was never attached (inert at present)
        data["未分类标题"] = temp_title  # the key name can be changed as needed

    return data


def _trim_before_start_marker(cleaned_text, start_pattern_1, start_pattern_2):
    """Cut first-page text so it begins at (pattern 1) or just after (pattern 2) the start line."""
    plain_lines = cleaned_text.splitlines()
    # Same split, but with line endings kept, so the kept tail can be rebuilt
    # exactly as it appeared in the original text.
    raw_lines = cleaned_text.splitlines(keepends=True)
    for idx, line in enumerate(plain_lines):
        if regex.search(start_pattern_1, line):
            # Keep the matching line itself.
            return ''.join(raw_lines[idx:])
        if start_pattern_2 and regex.search(start_pattern_2, line):
            # The matching line is only a section title: start on the next line
            # (empty result if the title is the last line of the page).
            return ''.join(raw_lines[idx + 1:])
    return cleaned_text


def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2):
    """
    Extract the clause text of a PDF, trimmed to the wanted start and end.

    Every page is cleaned with ``clean_page_content``. On the first page the
    text before the start marker is dropped; on the last page the text from
    the last ``end_pattern`` match onward is dropped. The first line of each
    page is prefixed with '##' so that ``parse_text_by_heading`` can recognise
    page starts.

    The start position is located by line index. Searching the page text for
    the matching line's content is unreliable: it can hit an earlier substring,
    and an empty following line is "found" at offset 0, which would keep the
    title line that pattern 2 is meant to skip.

    Parameters:
    file_path (str): path of the PDF to read
    end_pattern (str): regex (``regex`` module, MULTILINE) marking the end
    start_pattern_1 (str): regex for a start line that is kept
    start_pattern_2 (str | None): regex for a start line that is skipped, so
        the text begins on the following line (used for titles such as
        '三、招投标须知正文'); may be None or empty to disable

    Returns:
    str: the text of all pages joined with '\n'
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    last_page_index = len(pdf_document.pages) - 1
    all_pages_text = []

    for i, page in enumerate(pdf_document.pages):
        # Extract once per page; extraction is the expensive step.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # The start marker is only looked for on the first page.
        if i == 0:
            cleaned_text = _trim_before_start_marker(cleaned_text, start_pattern_1, start_pattern_2)

        # The end marker is only looked for on the last page; the last match wins.
        if i == last_page_index:
            matches = list(regex.finditer(end_pattern, cleaned_text, regex.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]

        # Mark the first line of the page.
        all_pages_text.append('##' + cleaned_text)

    # Merge the text of all pages.
    return "\n".join(all_pages_text)


def _clause_boundary_patterns(clause_type):
    """Return (start_pattern_1, start_pattern_2, end_pattern) for the given clause type."""
    # The patterns use variable-width lookbehinds such as (?<!见\s*), which
    # need the third-party `regex` module; the stdlib `re` rejects them.
    if clause_type == 2:
        start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$'
        start_pattern_2 = None
        end_pattern = (
            r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # alternative 1: chapter/part title at line start
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # negative lookbehinds: not a cross-reference or quotation
            r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'  # alternative 2: chapter/part title at line end
        )
    else:
        start_pattern_1 = (
            r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'  # bracketed ordinal, optionally followed by punctuation
            r'[一二12][、．.]+|'  # bare ordinal followed by punctuation
            r'[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # followed by one of the keywords 说明 / 总则 / 名词解释
        )
        start_pattern_2 = (
            r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # not preceded by 见 / 与 or a quotation mark
            r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # '...须知正文' title line
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # or a chapter/part title
        )

        end_pattern = (
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[：:]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[：:]|'
            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[：:]'
        )
    return start_pattern_1, start_pattern_2, end_pattern


def convert_clause_to_json(file_path, output_folder, type=1):
    """
    Parse the clauses of a PDF and save them as a JSON file.

    Parameters:
    file_path (str): path of the PDF (the extension check ignores case)
    output_folder (str): folder for the JSON file; created if missing
    type (int): 2 selects the announcement/invitation patterns, any other
        value selects the instructions-to-bidders patterns. The output file is
        'clause1.json' when type == 1 and 'clause2.json' otherwise. (The name
        shadows the builtin but is kept for existing keyword callers.)

    Returns:
    str: path of the written JSON file, or "" if the input file does not
    exist, is not a PDF, or any error occurred. Errors are printed, never
    raised.
    """
    try:
        if not os.path.exists(file_path):
            print(f"The specified file does not exist: {file_path}, 返回空的clause_path")
            return ""
        # Accept '.PDF' as well as '.pdf'.
        if not file_path.lower().endswith('.pdf'):
            raise ValueError("Unsupported file format")

        start_pattern_1, start_pattern_2, end_pattern = _clause_boundary_patterns(type)
        text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
        parsed_data = parse_text_by_heading(text)

        # Create the output folder if needed; exist_ok guards against another
        # process creating it between the check and the call.
        if not os.path.exists(output_folder):
            os.makedirs(output_folder, exist_ok=True)
            print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
        file_name = "clause1.json" if type == 1 else "clause2.json"
        output_path = os.path.join(output_folder, file_name)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, indent=4, ensure_ascii=False)
        print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
        return output_path
    except Exception as e:
        # Deliberate best-effort boundary: callers test for "" instead of
        # handling exceptions.
        print(f"Error in convert_clause_to_json: {e}")
        return ""


if __name__ == "__main__":
    # Manual smoke test against a local sample file (developer machine paths).
    file_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test\泉州白濑水利枢纽工程高低压配电装置设计、制造及采购 (1)_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test'
    # convert_clause_to_json never raises: it prints the error and returns "".
    # Check the return value instead of catching ValueError.
    output_path = convert_clause_to_json(file_path, output_folder)
    if output_path:
        print(f"Final JSON result saved to: {output_path}")
    else:
        print("Error: no JSON file was produced")