# zbparse/flask_app/main/ttt.py

import json
import docx
import re
import os
from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header

def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Only the paragraphs exposed by ``Document.paragraphs`` are read; the
    paragraph texts are joined with a single newline.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)


def extract_text_from_pdf(file_path):
    """Extract the cleaned text of every page of a PDF file.

    Each page's text is passed through ``clean_page_content`` together with
    the value returned by ``extract_common_header(file_path)`` (presumably the
    header repeated on every page -- confirm against that helper), and the
    cleaned pages are concatenated, each followed by a newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated, cleaned page texts.
    """
    common_header = extract_common_header(file_path)
    reader = PdfReader(file_path)

    cleaned_pages = []
    for page in reader.pages:
        # Extract once per page: text extraction is the expensive step, and
        # a page without extractable text is treated as an empty string.
        raw_text = page.extract_text() or ""
        cleaned_pages.append(clean_page_content(raw_text, common_header))

    # Every page (including the last one) is terminated by a newline.
    return "".join(page_text + "\n" for page_text in cleaned_pages)

def extract_section(text, start_pattern, end_phrases):
    """Return the part of ``text`` between a start pattern and an end pattern.

    The section begins right after the first match of ``start_pattern`` and
    stops just before the match of an end pattern. End patterns are tried in
    list order and the first pattern that matches anywhere in the remaining
    text wins, even if a later pattern would match earlier in the text.
    End patterns are searched with ``re.MULTILINE`` so ``^`` anchors at line
    starts.

    Returns:
        The extracted section; ``""`` when ``start_pattern`` is not found;
        everything after the start match when no end pattern matches.
    """
    opening = re.search(start_pattern, text)
    if opening is None:
        return ""

    # Everything following the start marker; end patterns are searched here.
    remainder = text[opening.end():]

    for end_pattern in end_phrases:
        closing = re.search(end_pattern, remainder, flags=re.MULTILINE)
        if closing is not None:
            return remainder[:closing.start()]

    return remainder

def compare_headings(current, new):
    """Tell whether heading number ``new`` comes after ``current``.

    Both arguments are dotted heading numbers such as ``'1.'``, ``'1.2'`` or
    ``'2.3.1'``. Only the purely numeric components are considered. The
    components are compared left to right; the first differing component
    decides. When one heading is a prefix of the other, ``new`` counts as
    "after" only if it has more components (i.e. it is a sub-heading).

    Returns:
        True if ``new`` is strictly after ``current``, otherwise False.
    """
    def _levels(heading):
        return [int(part) for part in heading.split('.') if part.isdigit()]

    # Lexicographic list comparison: first differing level decides, and a
    # strict prefix is smaller than the longer list.
    return _levels(new) > _levels(current)


def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    ``content`` is a list of string fragments. They are concatenated and
    stripped; the result is True when that text contains any of ``keywords``
    or is at most ``max_length`` characters long.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length

def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, optionally after a newline.

    When ``append_newline`` is set, a ``'\\n'`` fragment is inserted first if
    ``should_add_newline`` approves of the content collected so far; the
    pending flag is consumed either way. ``current_content`` is mutated in
    place.

    Returns:
        The updated ``append_newline`` flag (False once it has been consumed).
    """
    if not append_newline:
        current_content.append(line_content)
        return append_newline

    if should_add_newline(current_content, keywords):
        current_content.append('\n')  # keep the break after the heading text
    current_content.append(line_content)
    return False

def parse_text_by_heading(text):
    """Split ``text`` into a dict keyed by numbered headings.

    A line starts a new entry when it begins with a dotted number such as
    ``1.1`` / ``2.2.3`` (or ``1.`` followed by text) and that number comes
    after the current heading according to ``compare_headings``. All other
    non-blank lines are appended to the current entry. Text that appears
    before the first heading is collected but never stored.

    Extra handling for two-component headings (``x.x`` and ``x.``): the line
    break after the heading's own line is kept as ``'\\n'`` when the content
    collected so far contains one of the keywords or is short (see
    ``should_add_newline``); otherwise lines are joined without a separator.

    Args:
        text: The section text, with lines separated by ``'\\n'``.

    Returns:
        dict mapping heading number (e.g. ``'1.1'`` or ``'3.'``) to its
        content with all spaces removed. A repeated heading number overwrites
        the earlier entry.
    """
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_content = []
    append_newline = False

    for line in text.split('\n'):
        line_stripped = line.strip()
        # Headings with at least one dot-separated sub-number ('1.1', '2.2.3').
        # The pattern is anchored at the start of the stripped line, so the
        # look-behind can never reject a match there.
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            # Top-level headings such as '3. Title' (text after the dot required).
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)

        if not match:
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        if current_key is None:
            starts_new_section = True
        else:
            # A number that directly follows a fragment ending in '第' is not
            # treated as a heading (presumably a wrapped reference like
            # '第 3.1 款' -- confirm). endswith() is used instead of [-1][-1]
            # because the last fragment may be an empty string, e.g. when a
            # heading number stands alone on its line; indexing it raised
            # IndexError.
            follows_di = bool(current_content) and current_content[-1].endswith('第')
            starts_new_section = compare_headings(current_key, new_key) and not follows_di

        if starts_new_section:
            if current_key is not None:
                # Store the finished entry; spaces are removed, kept '\n' stay.
                content_string = ''.join(current_content).strip()
                data[current_key] = content_string.replace(' ', '')
            current_key = new_key
            current_content = [line_content]
            # Only headings with exactly two dot-separated parts ('1.1', and
            # also '1.' which splits into ['1', '']) request a line break.
            append_newline = len(new_key.split('.')) == 2
        else:
            # Not a later heading: keep the text (without its number) as content.
            append_newline = handle_content_append(current_content, line_content, append_newline, keywords)

    if current_key is not None:
        # Store the final entry.
        content_string = ''.join(current_content).strip()
        data[current_key] = content_string.replace(' ', '')

    return data

def convert_to_json(file_path, start_word, end_phrases):
    """Read a .docx or .pdf file and parse one section of it by heading.

    The file's text is extracted according to its extension, narrowed to the
    span between ``start_word`` and the first matching entry of
    ``end_phrases`` (see ``extract_section``), and then split into a dict by
    ``parse_text_by_heading``.

    Raises:
        ValueError: If ``file_path`` ends with neither ``.docx`` nor ``.pdf``.
    """
    is_docx = file_path.endswith('.docx')
    if not is_docx and not file_path.endswith('.pdf'):
        raise ValueError("Unsupported file format")

    if is_docx:
        full_text = extract_text_from_docx(file_path)
    else:
        full_text = extract_text_from_pdf(file_path)

    # Keep only the text between the start marker and the end marker.
    section_text = extract_section(full_text, start_word, end_phrases)
    return parse_text_by_heading(section_text)

def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract clauses from a tender document and save them as a JSON file.

    Args:
        input_path: Path of the .docx/.pdf document to parse.
        output_folder: Folder that receives the JSON file. It is created if
            it does not exist yet.
        type: ``1`` extracts the text following "投标人须知正文" up to the
            evaluation-method chapter or an appendix/attachment marker and
            writes ``clause1.json``; any other value extracts the text from
            the tender-announcement markers up to the bidder-instructions
            markers and writes ``clause2.json``.

    Returns:
        The path of the written JSON file, or ``""`` when ``input_path`` does
        not exist.

    Raises:
        ValueError: Propagated from ``convert_to_json`` for unsupported file
            extensions.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""
    if type == 1:
        start_word = "投标人须知正文"
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
            r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']
    result = convert_to_json(input_path, start_word, end_phrases)
    file_name = "clause1.json" if type == 1 else "clause2.json"
    # Make sure the destination exists; open() below would otherwise fail
    # with FileNotFoundError for a fresh output folder. An empty folder name
    # means "current directory" and needs no creation.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    # Rewrites the file in place, splitting long top-level entries.
    post_process_json(output_path)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path

def post_process_json(json_file_path):
    """Split long top-level entries of a clause JSON file, rewriting it in place.

    An entry qualifies when its key is a bare top-level number such as
    ``'5.'`` and its value contains a newline. The text before the first
    newline is the title; it is stored under both the original key and a
    generated ``'5.0'`` key. The rest is split on item markers (digits
    followed by ``.`` or ``、``) and each item is stored as ``'5.0.1'``,
    ``'5.0.2'``, ... numbered by position, not by the marker's own digits.
    All other entries are copied unchanged.

    NOTE(review): text between the title and the first item marker is
    discarded, and when no marker is present the whole remainder is dropped.
    The marker pattern also matches any digits followed by a dot inside the
    text (e.g. decimals). Confirm whether this is intended.
    """
    with open(json_file_path, 'r', encoding='utf-8') as source:
        clauses = json.load(source)

    rewritten = {}
    for heading, body in clauses.items():
        is_top_level = re.match(r'^\d+\.\s*$', heading) is not None
        if not (is_top_level and '\n' in body):
            # Nothing to split: keep the entry as it is.
            rewritten[heading] = body
            continue

        title, remainder = body.split('\n', 1)
        title = title.strip()

        # '5.' -> '5.0'; the title is duplicated under the generated key.
        zero_key = heading.rstrip('.') + ".0"
        rewritten[heading] = title
        rewritten[zero_key] = title

        # With a capturing group the split result alternates
        # [preamble, marker, text, marker, text, ...]; the item texts sit at
        # the even indices from 2 onwards.
        pieces = re.split(r'(\d+[\.\、])\s*', remainder)
        for ordinal, item_text in enumerate(pieces[2::2], start=1):
            rewritten[f"{zero_key}.{ordinal}"] = item_text.strip()

    with open(json_file_path, 'w', encoding='utf-8') as sink:
        json.dump(rewritten, sink, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    # Manual run against a local sample document; the paths below are
    # machine-specific and must be adjusted before running elsewhere.
    sample_document = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
    destination_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
    try:
        saved_path = convert_clause_to_json(sample_document, destination_folder)
        print(f"Final JSON result saved to: {saved_path}")
    except ValueError as e:
        # Raised for unsupported file extensions.
        print("Error:", e)