zbparse/flask_app/general/无效标和废标公共代码.py

import json
import os
import re
import regex
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.general.doubao import generate_full_user_query
from flask_app.general.insert_del_pagemark import insert_mark
from flask_app.general.通义千问long import qianwen_plus
from flask_app.general.通用功能函数 import process_string_list
from collections import OrderedDict
from docx import Document


def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalise the paragraphs collected by the keyword extractor.

    Entries are split into two groups that share one running index, so the
    caller can later merge them back in their original order:

    * single-paragraph entries: the leading numbering is stripped, the text is
      cut after the first sentence that contains the keyword, spaces are
      removed, and the result is kept only if it is longer than 8 characters;
    * multi-paragraph entries: only the first paragraph's numbering is
      stripped, then all paragraphs are joined with newlines and spaces are
      removed.

    Args:
        extracted_contents: mapping whose values are lists of paragraph
            strings. It is not modified.
        keywords: regular-expression pattern handed to ``re.search``.
        excludes: substrings; a single-paragraph entry containing any of them
            is dropped.

    Returns:
        ``(all_text1, all_text2)`` - two dicts keyed by the shared index,
        holding the single-paragraph and multi-paragraph results respectively.
    """
    # Zero-width split point right after a sentence-ending punctuation mark.
    split_pattern = r'(?<=[。！？\!\?])'
    # Leading numbering, e.g. 1 | (1) | （2） | 1. | 2． | 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
    clean_pattern = (r'^\s*(?:[（(]\s*\d+\s*[)）)]|'
                     r'[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|'
                     r'[一二三四五六七八九十]+、|'
                     r'[A-Za-z][)）\.、．]?\s*)')

    all_text1 = {}  # results of single-paragraph entries, keyed by shared index
    all_text2 = {}  # results of multi-paragraph entries, keyed by shared index
    idx = 0  # shared running index

    for text_list in extracted_contents.values():
        if not text_list:
            # Nothing to clean; previously this raised IndexError.
            continue

        if len(text_list) == 1:
            data = text_list[0]
            if any(exclude in data for exclude in excludes):
                continue
            data = re.sub(clean_pattern, '', data).strip()

            cleaned_text = data  # fallback: no keyword / no sentence found
            keyword_match = re.search(keywords, data)
            if keyword_match:
                start_pos = keyword_match.start()
                # Keep everything up to the end of the first sentence that
                # starts at the keyword (punctuation included).
                first_sentence = re.split(split_pattern, data[start_pos:], maxsplit=1)[0]
                if first_sentence:
                    cleaned_text = data[:start_pos] + first_sentence

            cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('　', '')
            if len(cleaned_text_no_spaces) > 8:
                all_text1[idx] = cleaned_text_no_spaces
                idx += 1
        else:
            # Strip the numbering of the first paragraph only, working on a
            # new list so the caller's data is left untouched.
            first = re.sub(clean_pattern, '', text_list[0]).strip()
            joined_text = "\n".join([first, *text_list[1:]])
            all_text2[idx] = joined_text.replace(' ', '').replace('　', '')
            idx += 1

    # all_text1 still needs to be filtered by the LLM; all_text2 is used as is.
    return all_text1, all_text2

# Handle paragraphs that were split across page boundaries
def preprocess_paragraphs(elements):
    """Flatten document elements into a list of paragraph strings.

    ``elements`` is walked once from start to end:

    * ``str`` items are copied to the output unchanged;
    * items without a ``.text`` attribute are skipped;
    * a paragraph containing a ``$$index_mark_N$$`` page marker is dropped.
      If the nearest text paragraph before it and the nearest one after it
      look like one sentence cut in two, they are merged into one entry;
    * short (< 8 characters) Chinese-numeral headings such as "一、" or
      "（一）" are dropped;
    * a paragraph that follows a numbered header (e.g. "12.1 ...") may be
      appended to the previous output entry (see the ``flag`` handling below).

    Args:
        elements: sequence of ``str`` items and objects exposing ``.text``.

    Returns:
        list of strings in document order.
    """
    processed = []  # output paragraphs
    index = 0
    flag = False  # True right after a numbered-header paragraph was emitted
    is_combine_table = False  # NOTE(review): assigned but never read in this function

    # Chinese-numeral headings: "一、" style and "（一）" style
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')

    # Pattern for paragraphs that start like a list item
    list_item_pattern = re.compile(
        r'^\s*('
        r'[（\(]\d+[）\)]|'  # (1) or （1）
        r'[A-Za-z]\.\s*|'  # A. or b.
        r'[一二三四五六七八九十]+、|'  # Chinese numeral followed by 、
        r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 第x章 / 第x部分 / 第x节
        r'[A-Za-z]\d+(?:\.\d+)*[\s\.、．)\）]?|'  # A1.2 and similar
        r'\d+(?:\.\d+)+[\s\.、．)\）]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # multi-level numbers such as 1.1, 1.1.1
        r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digits + whitespace not followed by one of the listed unit characters
        r'(?=\d+[、.．])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digits directly followed by 、 or a dot
        r')'
    )

    # Paragraphs that start with a numeric section number
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)'  # e.g. '12.1 content'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # e.g. '12. content'
    )

    # True when the text contains a whitespace run longer than max_space_count;
    # used to leave table-like / fill-in-the-blank lines alone.
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    # Page marker inserted upstream
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    # Helper: nearest previous paragraph that is non-empty and not a marker.
    # Returns ('', -1) when a str element is reached first or nothing is found.
    def find_prev_text(current_index):
        for i in range(current_index - 1, -1, -1):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # element has no .text attribute, skip it
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    # Helper: nearest following paragraph that is non-empty, not a marker and
    # not a short Chinese-numeral heading. Returns ('', -1) when a str element
    # is reached first or nothing is found.
    def find_next_text(current_index):
        for i in range(current_index + 1, len(elements)):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue  # element has no .text attribute, skip it
            # Skip blank paragraphs and page markers
            if not text or pattern_marker.search(text):
                continue
            # Skip short headings matched by the exclusion patterns
            if (pattern_numbered.match(text) or pattern_parentheses.match(text)) and len(text) < 8:
                continue
            return text, i
        return '', -1

    while index < len(elements):
        if isinstance(elements[index], str):
            processed.append(elements[index])
            index += 1
            continue
        try:
            current_text = elements[index].text.strip()  # trimmed text of the current paragraph
        except AttributeError:
            # Element has no .text attribute: skip it
            index += 1
            continue

        # Is the current paragraph a page marker?
        if pattern_marker.search(current_text):
            # Nearest real paragraph before the marker
            prev_text, prev_index = find_prev_text(index)
            # Nearest real paragraph after the marker
            next_text, next_index = find_next_text(index)

            # Merge only when neither side looks like a table / blank-filling line
            if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
                # A trailing comma is deliberately not treated as an ending:
                # the sentence may still continue after it.
                if not prev_text.endswith(('。', '!', '?')):
                    # The following paragraph must not start a new list item
                    if not list_item_pattern.match(next_text) and len(prev_text) > 30:
                        # Join the two halves; the space is for readability
                        merged_text = prev_text + ' ' + next_text
                        if prev_index < len(elements):
                            # Drop the previous paragraph from the output if it
                            # is still the last entry there
                            if processed and processed[-1] == prev_text:
                                processed.pop()
                            # Emit the merged text
                            processed.append(merged_text)

                        # Resume after the paragraph that was merged in
                        index = next_index + 1
                        continue

            # No merge: advance past the marker and any directly following
            # blank paragraphs / markers; stop at a str element or real text.
            skip_index = index + 1
            while skip_index < len(elements):
                if isinstance(elements[skip_index], str):
                    break
                try:
                    skip_text = elements[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue  # element has no .text attribute, skip it
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
                    break
            index = skip_index
            continue

        # Drop short Chinese-numeral headings
        if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
            index += 1
            continue

        # Does the paragraph start with a numeric section number?
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)

        if match:
            # Numbered header: emit it and remember that the next paragraph
            # may be its continuation
            processed.append(current_text)
            flag = True
            index += 1
            continue
        else:
            if flag:
                if not list_item_pattern.match(current_text):
                    if processed:
                        next_non_empty_text, next_non_empty_index = find_next_text(index)
                        is_next_numbered = False
                        if next_non_empty_text:
                            is_next_numbered = bool(
                                pattern_numeric_header.match(next_non_empty_text) or
                                pattern_numeric_header_fallback.match(next_non_empty_text)
                            )

                        if is_next_numbered and len(processed[-1]) > 30:
                            # Append to the previous entry only when the next
                            # paragraph is numbered and the previous entry is
                            # longer than 30 characters
                            processed[-1] = processed[-1] + ' ' + current_text
                        else:
                            # Otherwise keep it as a paragraph of its own
                            processed.append(current_text)
                else:
                    # Paragraphs that look like list items are kept as well
                    processed.append(current_text)
                # The flag is consumed whether or not the text was appended
                flag = False
                index += 1
                continue
            else:
                # No pending numbered header: emit the paragraph as is
                processed.append(current_text)
                index += 1
                continue

    return processed

def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keywords):
    """Collect paragraphs that match ``keywords`` plus the items that follow them.

    Every paragraph matching one of ``keywords`` becomes a key of the result,
    mapped to a list that starts with the paragraph itself. If the paragraph
    also matches one of ``follow_up_keywords`` (e.g. "one of the following
    situations"), the paragraphs after it are collected into the same list:

    * when the paragraph starts with a section number (``3.4``, ``3．4`` ...),
      collection runs until a paragraph starts with a number of the same
      level, one of the next five parent or top-level numbers, or ``N、``;
    * otherwise the numbering style of the first following paragraph is
      detected and collection runs while paragraphs keep that style;
    * a ``[$$table_start$$]`` marker met while collecting pulls in every item
      up to ``[$$table_over$$]`` (or the end of the list when the closing
      marker is missing).

    Args:
        processed_paragraphs: list of paragraph strings.
        keywords: a regex pattern or a list of regex patterns.
        follow_up_keywords: list of regex patterns that trigger collection.

    Returns:
        ``OrderedDict`` mapping each matched paragraph to its collected list.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None
    paragraph_count = len(processed_paragraphs)

    # Section numbers may be separated by an ASCII or a full-width dot.
    separator = re.compile(r'[.．]')
    # Numbering styles recognised for un-numbered headers:
    #   A1 / 1.1 / abc.123   |   (1) （2）   |   1、 2、
    number_pattern = (r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)'
                      r'|([（(]\s*[一二三四五六七八九十\d]+\s*[）)])'
                      r'|([一二三四五六七八九十\d]+\s*、)')

    def match_keywords(text, patterns):
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def build_stop_pattern(section_number):
        """Compile the pattern that ends collection for a numbered header."""
        # Splitting on both dot variants keeps "3．1" from reaching int() whole.
        parts = separator.split(section_number)
        level_count = len(parts) - 1
        # Same level, e.g. 3.4.5; the lookahead rejects deeper levels.
        same_level = r'^' + (r'\d+\s*[.．]\s*') * level_count + r'\d+' + r'(?!\s*[.．]\s*\d+)'
        matched_patterns = [same_level]

        # Next parent sections; a range of five tolerates skipped numbers.
        if len(parts) > 1:
            for offset in range(1, 6):
                parent_parts = parts[:-1]
                parent_parts[-1] = str(int(parent_parts[-1]) + offset)
                matched_patterns.append(
                    r'^' + r'\s*[.．]\s*'.join(parent_parts) + r'(?!\s*[.．]\s*\d+)')

        # "digits 、" style
        matched_patterns.append(r'^\d+\s*、')

        # Next top-level sections
        top_level_num = int(parts[0])
        for offset in range(1, 6):
            next_top_level_pattern = r'^' + str(top_level_num + offset) + r'\s*[.．]'
            if next_top_level_pattern not in matched_patterns:
                matched_patterns.append(next_top_level_pattern)

        return re.compile(r'(' + r')|('.join(matched_patterns) + r')')

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            # A table directly below the active header belongs to it.
            if text == '[$$table_start$$]':
                current_index += 1
                # Bounded: a table at the end of the document may have no
                # closing marker.
                while (current_index < paragraph_count
                       and processed_paragraphs[current_index] != '[$$table_over$$]'):
                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
                    current_index += 1
                return current_index
            if current_section_pattern and re.match(current_section_pattern, text):
                # Reached the next section: stop collecting and fall through
                # so this paragraph is still checked for keywords.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
                    return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．]?', text)
                if section_number:
                    # Numbered header: collect until a similar number shows up.
                    current_section_pattern = build_stop_pattern(section_number.group(1))
                else:
                    found_next_number = False
                    current_section_pattern = None

                    while current_index < paragraph_count - 1:
                        current_index += 1
                        next_text = processed_paragraphs[current_index].strip()
                        if not next_text:
                            continue  # skip blank lines
                        if not found_next_number:
                            next_section_number = re.match(number_pattern, next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = separator.split(next_section_number.group(1))
                                    dynamic_pattern = r'^' + r'[.．]'.join(
                                        [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    dynamic_pattern = r'^[\(\（]\s*[一二三四五六七八九十\d]+\s*[\)\）]'
                                elif next_section_number.group(3):
                                    dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            # Step back so the caller's increment lands on this
                            # paragraph and it is checked for keywords too.
                            current_index -= 1
                            break

        return current_index

    index = 0
    while index < paragraph_count:
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs

# Split the text of one table cell into sentence-like fragments
def split_cell_text(text):
    """Split a table cell's text at sentence ends and before item numbers.

    Parenthesised spans (ASCII or full-width brackets) are replaced by
    placeholders before splitting and restored afterwards. Fragments that
    are empty or whitespace-only are dropped.
    """
    saved_brackets = []

    def _stash(match):
        # Remember the bracketed span and leave an indexed placeholder behind.
        saved_brackets.append(match.group(0))
        return f"<BRACKET_{len(saved_brackets) - 1}>"

    def _restore(match):
        return saved_brackets[int(match.group(1))]

    # 1. Mask bracketed content
    masked_text = re.sub(r'[（(][^（()）)]+[）)]', _stash, text)

    # 2. Zero-width split points (needs `regex` for variable-length lookbehind)
    boundary_pattern = (
        r'(?<=[。！？!?\?])|'  # after a sentence-ending punctuation mark
        r'(?<![A-Za-z]\s*)(?<!\d[.．]?)(?=\d+(?:[.．]\d+)+(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # before a multi-level number not followed by a unit character
        r'(?<![+\-×÷*/.\．A-Za-z]\s*|\d)(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # before "digits + whitespace", e.g. '1 xx'
        r'(?<![+\-×÷*/.\．A-Za-z]\s*|\d)(?=\d+[、.．](?!\d|(?:\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家])))|'  # before "digits + 、/dot" when no digit follows, e.g. '1.'
        r'(?<![A-Za-z])(?=[A-Za-z][.．]\s*(?![A-Za-z]))|'  # before "single letter + dot"; URLs such as www.baidu.com are excluded
        r'(?=[A-Za-z]+\s*\d+\s*(?:[.．]\s*\d+)*)|'  # before "letters + number" (optionally multi-level)
        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)'  # before a Chinese numeral + 、 at the start of a line
    )
    fragments = regex.split(boundary_pattern, masked_text)

    # 3. Restore bracketed content, 4. drop blank fragments
    result = []
    for fragment in fragments:
        restored = re.sub(r"<BRACKET_(\d+)>", _restore, fragment)
        if restored.strip():
            result.append(restored)
    return result

# File preprocessing: extract text and tables in document order and merge
# tables that continue across pages
def extract_file_elements(file_path):
    """Read a .docx file into one ordered list of paragraphs and table text.

    Body elements are visited in document order. Paragraphs are appended as
    python-docx paragraph objects. A table contributes a ``'[$$table_start$$]'``
    marker followed by the split text of every cell longer than 8 characters;
    its first row is treated as the header and skipped. Blank paragraphs and
    ``$$index_mark_N$$`` page markers after a table are dropped, so a table
    that follows is treated as a continuation of the previous one. The table
    is closed with ``'[$$table_over$$]'`` when real text follows or the
    document ends.

    Args:
        file_path: path of the .docx file.

    Returns:
        list mixing paragraph objects and strings (markers / cell fragments).
    """
    doc = Document(file_path)
    doc_paragraphs = doc.paragraphs
    doc_tables = doc.tables
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
    sentence_endings = ('。', '!', '?', ';')

    doc_contents = []
    paragraph_index = 0
    tables_index = 0
    # An explicit flag is used because the header text itself may be an
    # empty string, which would read as "not in a table".
    in_table = False
    pre_table_head = None  # first header cell of the table being extracted
    table_combine = False  # True once a continuation table has been seen

    for element in doc.element.body:
        if element.tag.endswith('}p'):
            paragraph = doc_paragraphs[paragraph_index]
            paragraph_index += 1
            if in_table:
                text = paragraph.text
                # Blank lines / page markers right after a table are dropped
                # so that the table can continue on the next page.
                if text == '' or pattern_marker.search(text):
                    continue
                # Real text: the table is finished.
                doc_contents.append('[$$table_over$$]')
                in_table = False
                table_combine = False
                pre_table_head = None
            doc_contents.append(paragraph)
        elif element.tag.endswith('}tbl'):
            table = doc_tables[tables_index]
            tables_index += 1
            table_content = []
            for row_idx, row in enumerate(table.rows):
                if row_idx == 0:
                    head_text = row.cells[0].text
                    if in_table:
                        table_combine = True
                        # Skip the header when it is repeated on the new page;
                        # otherwise the row is content and is processed below.
                        if pre_table_head == head_text:
                            continue
                    else:
                        # First table of a group: remember and skip its header.
                        pre_table_head = head_text
                        in_table = True
                        doc_contents.append('[$$table_start$$]')
                        continue
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if len(cell_text) > 8:  # ignore short cells
                        table_content += split_cell_text(cell_text)

            # Cross-page merge: glue the first fragment of the continuation
            # onto an unfinished last fragment of the previous part. Skipped
            # when there is nothing to glue or when the last entry is the
            # start marker rather than a text fragment.
            if table_combine and table_content:
                last_item = doc_contents[-1]
                if (isinstance(last_item, str)
                        and last_item != '[$$table_start$$]'
                        and not last_item.endswith(sentence_endings)):
                    doc_contents[-1] = last_item + ' ' + table_content.pop(0)
            doc_contents.extend(table_content)

    # A table at the very end of the document still needs its closing marker.
    if in_table:
        doc_contents.append('[$$table_over$$]')
    return doc_contents

def handle_query(file_path, user_query, output_file, result_key, keywords):
    """Run one keyword query against a .docx file and return the kept texts.

    Steps: extract the document elements, flatten them into paragraphs,
    collect the paragraphs matching ``keywords``, and clean them. The
    single-paragraph hits are de-duplicated, written to ``output_file`` as a
    numbered list, and filtered by the LLM. The multi-paragraph hits are kept
    without filtering. Both groups are merged back in document order.

    Args:
        file_path: path of the .docx file to analyse.
        user_query: prompt template passed to ``generate_full_user_query``.
        output_file: path of the temporary text file shown to the model.
        result_key: key of the returned dict.
        keywords: regex pattern selecting the paragraphs of interest.

    Returns:
        ``{result_key: list_of_texts}``. When nothing was extracted or an
        exception occurred, the list holds a single "not parsed" message.
    """
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证：", "我方"]
        follow_up_keywords = [
            r'情\s*形\s*之\s*一',
            r'情\s*况\s*之\s*一',
            r'下\s*列(?!\s*公式)',       # negative lookahead: ignore "下列公式"
            r'以\s*下(?!\s*公式)',       # negative lookahead: ignore "以下公式"
            r'其\s*他.*?情\s*形\s*[:：]',
            r'包\s*括'
        ]

        doc_contents = extract_file_elements(file_path)
        processed_paragraphs = preprocess_paragraphs(doc_contents)
        extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)

        final_list = [f"未解析到'{result_key}'！"]
        if not all_texts1 and not all_texts2:
            return {result_key: final_list}

        # De-duplicate on the first 25 characters, keeping the first hit.
        # The numbers written to the file refer to this list, so the model's
        # answers must be resolved against it as well.
        unique_items = []
        seen_prefixes = set()
        for global_idx, content in sorted(all_texts1.items(), key=lambda x: x[0]):
            prefix = content[:25]
            if prefix in seen_prefixes:
                continue
            seen_prefixes.add(prefix)
            unique_items.append((global_idx, content))

        with open(output_file, 'w', encoding='utf-8') as file:
            for number, (_, content) in enumerate(unique_items, start=1):
                file.write(f"{number}. {content}\n")
                file.write("..............." + '\n')

        if not unique_items:
            num_list = []
        else:
            user_query = generate_full_user_query(output_file, user_query)
            model_ans = qianwen_plus(user_query)
            num_list = process_string_list(model_ans)  # numbers chosen by the model
        print(result_key + "选中的序号:" + str(num_list))

        selected_contents = {}
        for index in num_list:
            # NOTE(review): the prompt allows the model to answer with text
            # instead of a number; presumably such entries can show up here -
            # they are ignored instead of raising TypeError in the comparison.
            if not isinstance(index, int):
                continue
            if 1 <= index <= len(unique_items):
                global_idx, content = unique_items[index - 1]
                selected_contents[global_idx] = content

        # Merge the selected single-paragraph hits with the multi-paragraph
        # hits and restore document order through the shared index.
        merged_dict = dict(selected_contents)
        merged_dict.update(all_texts2)
        final_list = [txt for _, txt in sorted(merged_dict.items(), key=lambda x: x[0])]

        return {result_key: final_list}
    except Exception as e:
        # Top-level boundary of one query: report and return the fallback so
        # the other queries are not affected.
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: [f"未解析到'{result_key}'！"]}


def combine_find_invalid(invalid_docpath, output_dir):
    """Run the three tender-rejection queries on one document in parallel.

    Each query is a tuple ``(keyword regex, prompt template, temp file path,
    result key)`` handled by ``handle_query`` in a worker thread. Submissions
    are spaced 0.5 s apart. The per-query results are merged in query order.

    Args:
        invalid_docpath: path of the .docx file to analyse.
        output_dir: directory for the temporary text files; created if missing.

    Returns:
        ``{"无效标与废标项": {result_key: result, ...}}``.
    """
    os.makedirs(output_dir, exist_ok=True)
    query_specs = [
        (
            r'否\s*决|'
            r'无\s*效\s*投\s*标|'
            r'无\s*效\s*文\s*件|'
            r'(?:文\s*件|投\s*标|响\s*应)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|'
            r'无\s*效\s*响\s*应|'
            r'无\s*效\s*报\s*价|'
            r'无\s*效\s*标|'
            r'视\s*为\s*无\s*效|'
            r'被\s*拒\s*绝|'
            r'将\s*拒\s*绝|'
            r'予\s*以\s*拒\s*绝',
            """以下是从招标文件中摘取的内容，文本中序号分明，各信息之间以...............分割。
任务目标：
从文本中筛选所有描述否决投标，拒绝投标，投标、响应无效或类似表述的情况，并返回对应的序号。
要求与指南：
    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
输出格式：
    以 [x, x, x] 的形式返回，x 为符合条件的信息的序号，为自然数。
    如果文本中没有符合条件的信息，请返回 []。
特殊情况：
    如果某序号的内容明显分为几部分且一部分内容符合筛选条件，但其他部分明显是无关内容，请返回符合部分的字符串内容代替序号。
示例输出,仅供格式参考：
    [1,3,4,6]
文本内容：{full_text}
            """,
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        (
            r'废\s*标',
            """以下是从招标文件中摘取的内容，文本中序号分明，文本内之间的信息以'...............'分割。
任务目标：
请根据以下内容，筛选出 废标项的情况 （明确描述导致 废标 的情况）并返回对应的序号。
要求与指南：
    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
输出格式：
    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的信息的序号，为自然数。
    如果文本中没有任何符合条件的废标情况，请返回 []。
示例输出,仅供格式参考：
    [1,3,4,6]
文本内容：{full_text}
            """,
            os.path.join(output_dir, "temp2.txt"),
            "废标项"
        ),
        (
            r'不\s*得(?!\s*(分|力))|禁\s*止\s*投\s*标',
            """以下是从招标文件中摘取的内容，文本中序号分明，文本内的条款以'...............'分割。条款规定了各方不得存在的情形。请根据以下要求进行筛选：
**投标相关主体与非投标相关主体的定义**：
    投标相关主体：包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
    非投标相关主体：包括但不限于“招标人”、“采购人”、“评标委员会”或其他描述非投标方的词语。
**筛选要求**：
1. **仅筛选**明确描述投标相关主体禁止情形或不得存在的情形的条款，不包含笼统或未具体说明情形的条款。例如：
    若条款内容包含'投标人不得存在的其他关联情形'这样的笼统描述，而未说明具体的情形，则无需添加该条款。
2. **排除**仅描述非投标相关主体行为限制或禁止情形的条款，例如“招标人不得泄露信息”或“评标委员会不得收受贿赂”，则无需返回。
3. 若条款同时描述了对投标相关主体与非投标相关主体的行为限制、禁止情形，也需返回。
4. **特殊情况**：如果条款中包含“磋商小组”、”各方“等既能指代投标相关主体又能指代非投标相关主体的词汇：
    若在语境中其指代或包含投标相关主体，则应将其考虑在内；否则，排除该条款。

**输出格式**：
    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的条款的序号，为自然数。
    如果没有符合条件的条款，返回 `[]`。
**示例**：
- **符合条件**：
  - `1. 投标人不得...` → 包含，返回序号 1。
  - `3. 联合体投标各方不得...` → 包含，返回序号 3。
- **不符合条件**：
  - `2. 采购人不得...` → 主语为“采购人”，排除。
-示例输出: [1,3]
请根据上述筛选要求，阅读以下文本内容，并返回符合条件的条款序号，

文本内容：{full_text}
            """,
            os.path.join(output_dir, "temp3.txt"),
            "不得存在的情形"
        )
    ]

    combined_dict = {}
    # One worker thread per query; results are read back in submission order.
    with ThreadPoolExecutor() as pool:
        pending = []
        for pattern, prompt, temp_path, label in query_specs:
            task = pool.submit(handle_query, invalid_docpath, prompt, temp_path, label, pattern)
            pending.append((task, label))
            time.sleep(0.5)  # wait 0.5 s before submitting the next query

        for task, label in pending:
            try:
                outcome = task.result()
            except Exception as e:
                print(f"线程处理 {label} 时出错: {e}")
                outcome = {label: ""}
            combined_dict.update(outcome)

    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}

if __name__ == '__main__':
    # Manual run against sample files on the developer's machine.
    start_time = time.time()
    # Source PDF of the sample; the page-mark insertion / pdf-to-docx steps
    # are not run here, the already converted .docx below is used instead.
    pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf'
    output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
    invalid_added_docx = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'

    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()

    formatted_results = json.dumps(results, ensure_ascii=False, indent=4)
    print("Results:", formatted_results)
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								import json
 								import os
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
+								import re
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								import regex
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								import time
 								from concurrent.futures import ThreadPoolExecutor
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								from flask_app.general.doubao import generate_full_user_query
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								from flask_app.general.insert_del_pagemark import insert_mark
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								from flask_app.general.通义千问long import qianwen_plus
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								from flask_app.general.通用功能函数 import process_string_list
-.16

											
										
										
											2024-12-16 13:56:28 +08:00
+								from collections import OrderedDict
 								from docx import Document
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达式提取到的东西格式化
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    all_text1 = {}  # 用于存储 len(text_list) == 1 的结果，按序号保存
 								    all_text2 = {}  # 用于存储 len(text_list) > 1 的结果，按序号保存
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    # 定义用于分割句子的正则表达式，包括中文和西文的结束标点
 								    split_pattern = r'(?<=[。！？\!\?])'
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    clean_pattern = (r'^\s*(?:[（(]\s*\d+\s*[)）)]|'
 								                     r'[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|'
 								                     r'[一二三四五六七八九十]+、|'
 								                     r'[A-Za-z][)）\.、．]?\s*)')
 								    idx = 0  # 共享的序号标记
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    for key, text_list in extracted_contents.items():
 								        if len(text_list) == 1:
 								            for data in text_list:
 								                # print(data)
 								                # 检查是否包含任何需要排除的字符串
 								                if any(exclude in data for exclude in excludes):
 								                    continue  # 如果包含任何排除字符串，跳过这个数据
 								                # 去掉开头的序号，eg:1 | (1) |（2） | 1. | 2．（全角点）| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                data = re.sub(clean_pattern, '', data).strip()
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								                keyword_match = re.search(keywords, data)
 								                if keyword_match:
 								                    # 从关键词位置开始查找结束标点符号
 								                    start_pos = keyword_match.start()
 								                    # 截取从关键词开始到后面的内容
 								                    substring = data[start_pos:]
 								                    # 按定义的结束标点分割
 								                    sentences = re.split(split_pattern, substring, 1)
 								                    if len(sentences) > 0 and sentences[0]:
 								                        # 只取第一句，保留标点
 								                        cleaned_text = data[:start_pos] + sentences[0]  # eg:经采购人允许，潜在投标人可进入项目现场进行考察，但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。
 								                        # 经采购人允许，潜在投标人可进入项目现场进行考察，但潜在投标人不得因此使采购人承担有关责任和蒙受损失。
 								                    else:
 								                        cleaned_text = data  # 如果没有标点，使用整个字符串
 								                else:
 								                    # 如果没有找到关键词，保留原文本
 								                    cleaned_text = data
 								                # 删除空格
 								                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('　', '')
 								                # 如果长度大于8，则添加到结果列表
 								                if len(cleaned_text_no_spaces) > 8:
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								                    all_text1[idx] = cleaned_text_no_spaces
 								                    idx += 1  # 更新共享序号
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								        else:
 								            # print(text_list)
 								            # print("*********")
 								            # 用于处理结构化文本，清理掉不必要的序号，并将分割后的段落合并，最终形成更简洁和格式化的输出。
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								            data = re.sub(clean_pattern, '', text_list[0]).strip()  # 只去除第一个的序号
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								            # 将修改后的第一个元素和剩余的元素连接起来
 								            text_list[0] = data  # 更新列表中的第一个元素
 								            joined_text = "\n".join(text_list)  # 如果列表中有多个元素，则连接它们
 								            # 删除空格
 								            joined_text_no_spaces = joined_text.replace(' ', '').replace('　', '')
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								            all_text2[idx] = joined_text_no_spaces
 								            idx += 1  # 更新共享序号
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    return all_text1, all_text2 # all_texts1要额外用gpt   all_text2直接返回结果
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
 								#处理跨页的段落
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
def preprocess_paragraphs(elements):
    """Flatten ordered document elements into cleaned paragraph strings.

    Args:
        elements: Ordered items of the document. Plain ``str`` items are
            copied to the output unchanged; every other item is read through
            its ``.text`` attribute (items without a usable ``.text`` are
            ignored).

    Returns:
        list: The processed paragraphs in document order.  Page markers of
        the form ``$$index_mark_N$$`` are removed, the paragraphs on either
        side of a marker are merged when the first one looks unfinished,
        headings that are just a short Chinese-numeral label (fewer than 8
        characters) are dropped, and a paragraph directly following a long
        numeric header may be appended to that header.
    """
    processed = []
    index = 0
    # Set right after a numeric-header paragraph is emitted; it is consumed
    # (and reset) by the next ordinary paragraph.
    flag = False

    # Short headings such as "一、" or "（一）".
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')

    # Anything that looks like the start of a list item.
    list_item_pattern = re.compile(
        r'^\s*('
        r'[（\(]\d+[）\)]|'  # (1) or （1）
        r'[A-Za-z]\.\s*|'  # A. or b.
        r'[一二三四五六七八九十]+、|'  # 一、 二、 三、
        r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 第x章 / 第x部分 / 第x节
        r'[A-Za-z]\d+(?:\.\d+)*[\s\.、．)\）]?|'  # A1.2 and similar
        r'\d+(?:\.\d+)+[\s\.、．)\）]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # 1.1 / 1.1.1
        r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digits + space, not followed by a unit word
        r'(?=\d+[、.．])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digits directly followed by 、 or a dot
        r')'
    )

    # Paragraphs that start with a numeric section number.
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)'  # e.g. '12.1 text'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # e.g. '12. text'
    )

    # Page-break marker inserted upstream.
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    # A paragraph ending with one of these is a finished sentence and must not
    # be glued to the paragraph after a page break.  The full-width forms are
    # included because they are what Chinese text normally uses.
    sentence_endings = ('。', '!', '?', '！', '？')

    def has_long_spaces(text, max_space_count=5):
        """Return True if ``text`` holds a whitespace run longer than the limit.

        Long runs indicate table-like or fill-in-the-blank content, which is
        excluded from merging.
        """
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    def text_of(element):
        """Return the stripped ``.text`` of ``element``, or None if unavailable."""
        try:
            return element.text.strip()
        except AttributeError:
            return None

    def is_short_heading(text):
        """Return True for a Chinese-numeral heading shorter than 8 characters."""
        return bool(pattern_numbered.match(text) or pattern_parentheses.match(text)) and len(text) < 8

    def is_numeric_header(text):
        """Return True if ``text`` starts with a numeric section number."""
        return bool(pattern_numeric_header.match(text) or pattern_numeric_header_fallback.match(text))

    def find_prev_text(current_index):
        """Find the closest earlier paragraph that has real text.

        The search stops (returning ``('', -1)``) as soon as a plain string
        element is met.
        """
        for i in range(current_index - 1, -1, -1):
            if isinstance(elements[i], str):
                return '', -1
            text = text_of(elements[i])
            if text is None:
                continue
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    def find_next_text(current_index):
        """Find the closest later paragraph that has real text.

        Empty paragraphs, page markers and short Chinese-numeral headings are
        skipped; a plain string element ends the search with ``('', -1)``.
        """
        for i in range(current_index + 1, len(elements)):
            if isinstance(elements[i], str):
                return '', -1
            text = text_of(elements[i])
            if text is None:
                continue
            if not text or pattern_marker.search(text):
                continue
            if is_short_heading(text):
                continue
            return text, i
        return '', -1

    while index < len(elements):
        element = elements[index]
        if isinstance(element, str):
            processed.append(element)
            index += 1
            continue

        current_text = text_of(element)
        if current_text is None:
            index += 1
            continue

        if pattern_marker.search(current_text):
            prev_text, _ = find_prev_text(index)
            next_text, next_index = find_next_text(index)
            can_merge = (
                prev_text
                and next_text
                and not has_long_spaces(prev_text)
                and not has_long_spaces(next_text)
                and not prev_text.endswith(sentence_endings)
                and not list_item_pattern.match(next_text)
                and len(prev_text) > 30
            )
            if can_merge:
                # Replace the already-emitted first half with the merged text.
                if processed and processed[-1] == prev_text:
                    processed.pop()
                processed.append(prev_text + ' ' + next_text)  # space kept for readability
                index = next_index + 1
                continue

            # No merge: step over the marker and any blank/marker paragraphs
            # that follow it.
            skip_index = index + 1
            while skip_index < len(elements):
                candidate = elements[skip_index]
                if isinstance(candidate, str):
                    break
                skip_text = text_of(candidate)
                if skip_text is None or skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
                    break
            index = skip_index
            continue

        if is_short_heading(current_text):
            index += 1
            continue

        if is_numeric_header(current_text):
            processed.append(current_text)
            flag = True
            index += 1
            continue

        if flag and not list_item_pattern.match(current_text):
            if processed:
                upcoming_text, _ = find_next_text(index)
                # Append to the header only when the following paragraph is
                # itself numbered and the header is long (> 30 characters).
                if upcoming_text and is_numeric_header(upcoming_text) and len(processed[-1]) > 30:
                    processed[-1] = processed[-1] + ' ' + current_text
                else:
                    processed.append(current_text)
        else:
            processed.append(current_text)
        flag = False
        index += 1

    return processed
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keywords):
    """Collect paragraphs matching ``keywords`` together with their follow-up items.

    Args:
        processed_paragraphs: Ordered list of paragraph strings.  The strings
            ``'[$$table_start$$]'`` and ``'[$$table_over$$]'`` delimit table
            content.
        keywords: A regex string or a list of regex strings; a paragraph that
            matches any of them (case-insensitively) becomes a result key.
        follow_up_keywords: List of regex strings.  When the keyword paragraph
            also matches one of these, the paragraphs after it are collected
            under the same key until a stop condition is met.

    Returns:
        OrderedDict: Maps each matched paragraph to a list whose first entry is
        the paragraph itself, followed by the collected follow-up paragraphs.
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    table_start = '[$$table_start$$]'
    table_over = '[$$table_over$$]'
    total = len(processed_paragraphs)

    # Numbering styles recognised on the first item after an un-numbered
    # keyword paragraph.
    number_pattern = (r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)'  # A1  1.1  abc.123
                      r'|([（(]\s*[一二三四五六七八九十\d]+\s*[）)])'  # (1) （2）
                      r'|([一二三四五六七八九十\d]+\s*、)')  # 1、 2、

    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        """Return True if any regex in ``patterns`` is found in ``text``."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def build_numbered_stop_pattern(section_number):
        """Compile the regex that marks the end of a numbered section.

        ``section_number`` is the leading number of the keyword paragraph
        (e.g. ``3.4.5``); collection stops at a paragraph numbered at the same
        level, at one of the next five parent numbers, at a ``digit、`` item,
        or at one of the next five top-level numbers.
        """
        # Split on both dot forms: the number was matched with [.．], so an
        # ASCII-only split would leave '3．1' in one piece and break int().
        parts = re.split(r'[.．]', section_number)
        level_count = len(parts) - 1

        # Same level, e.g. 3.4.5; the look-ahead rejects deeper levels.
        same_level = r'^' + (r'\d+\s*[.．]\s*') * level_count + r'\d+' + r'(?!\s*[.．]\s*\d+)'
        matched_patterns = [same_level]

        if len(parts) > 1:
            # Source documents sometimes skip numbers, so allow a gap of up to 5.
            for step in range(1, 6):
                parent_parts = parts[:-1]
                parent_parts[-1] = str(int(parent_parts[-1]) + step)
                matched_patterns.append(
                    r'^' + r'\s*[.．]\s*'.join(parent_parts) + r'(?!\s*[.．]\s*\d+)'
                )

        # Support the 'digit 、' numbering style.
        matched_patterns.append(r'^\d+\s*、')

        current_top_level_num = int(parts[0])
        for step in range(1, 6):
            next_top_level_pattern = r'^' + str(current_top_level_num + step) + r'\s*[.．]'
            if next_top_level_pattern not in matched_patterns:
                matched_patterns.append(next_top_level_pattern)

        return re.compile(r'(' + r')|('.join(matched_patterns) + r')')

    def build_item_pattern(number_match):
        """Compile a regex matching items numbered like ``number_match``."""
        if number_match.group(1):
            section_parts = re.split(r'[.．]', number_match.group(1))
            dynamic_pattern = r'^' + r'[.．]'.join(
                [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
        elif number_match.group(2):
            dynamic_pattern = r'^[\(\（]\s*[一二三四五六七八九十\d]+\s*[\)\）]'
        else:
            dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、'
        return re.compile(dynamic_pattern)

    def extract_from_text(text, current_index):
        """Process one paragraph; return the index of the last consumed one."""
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            # A table directly inside a collected section is copied wholesale.
            if text == table_start:
                current_index += 1
                # Bounded by the list length so a missing end marker cannot
                # run past the end of the list.
                while current_index < total and processed_paragraphs[current_index] != table_over:
                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
                    current_index += 1
                return current_index
            if current_section_pattern and re.match(current_section_pattern, text):
                # Reached the next section: stop collecting, then fall through
                # so this paragraph is still checked for keywords.
                continue_collecting = False
                active_key = None
            elif active_key is not None:
                extracted_paragraphs[active_key].append(text)
                return current_index

        if not match_keywords(text, keywords):
            return current_index

        active_key = text
        extracted_paragraphs[active_key] = [text]
        if not match_keywords(text, follow_up_keywords):
            return current_index

        continue_collecting = True
        section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．]?', text)
        if section_number:
            # The keyword paragraph is numbered: collect until a sibling or
            # higher-level number shows up (handled on later calls).
            current_section_pattern = build_numbered_stop_pattern(section_number.group(1))
            return current_index

        # The keyword paragraph has no number: look ahead, learn the numbering
        # style from the first item, and collect items of that style.
        found_next_number = False
        current_section_pattern = None
        while current_index < total - 1:
            current_index += 1
            next_text = processed_paragraphs[current_index].strip()
            if not next_text:
                continue  # skip blank lines
            if not found_next_number:
                next_section_number = re.match(number_pattern, next_text)
                if next_section_number:
                    found_next_number = True
                    current_section_pattern = build_item_pattern(next_section_number)
            if current_section_pattern and re.match(current_section_pattern, next_text):
                extracted_paragraphs[active_key].append(next_text)
            else:
                continue_collecting = False
                active_key = None
                # Step back so the caller re-examines the paragraph that ended
                # the collection instead of silently skipping it.
                current_index -= 1
                break
        return current_index

    index = 0
    while index < total:
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								# 分割表格中单元格文本
 								def split_cell_text(text):
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
+								    # 定义用于提取括号内容的正则表达式，支持中英文括号
 								    bracket_pattern = re.compile(r'[（(][^（()）)]+[）)]')
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    # 1. 先提取并替换括号内容
 								    bracket_contents = []
 								    def replace_bracket_content(match):
 								        bracket_contents.append(match.group(0))  # 保存括号内容
 								        return f"<BRACKET_{len(bracket_contents) - 1}>"  # 使用占位符替换括号内容
 								    item_with_placeholders = bracket_pattern.sub(replace_bracket_content, text)
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    # print("-----------")
 								    # print(text)
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    # 2. 分割句子，保证句子完整性（按标点符号和序号分割）
 								    split_sentences = regex.split(
 								        r'(?<=[。！？!?\?])|'  # 在中文句号、感叹号、问号后分割
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								        r'(?<![A-Za-z]\s*)(?<!\d[.．]?)(?=\d+(?:[.．]\d+)+(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # 匹配多级编号，限制后面不能是指定关键字
 								        r'(?<![+\-×÷*/.\．A-Za-z]\s*|\d)(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|'  # 数字后跟空格且不跟指定关键字时分割，且前面不包含 . 或 ． eg:'1.1 xx'
 								        r'(?<![+\-×÷*/.\．A-Za-z]\s*|\d)(?=\d+[、.．](?!\d|(?:\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家])))|'  # 数字后直接跟顿号、点号时分割，且点号后不跟数字  eg:'1.'
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								        r'(?<![A-Za-z])(?=[A-Za-z][.．]\s*(?![A-Za-z]))|'  # 单个字母+点号或单个字母+数字，排除www.baidu.com 网址情况
 								        r'(?=[A-Za-z]+\s*\d+\s*(?:[.．]\s*\d+)*)|'  # 在字母加数字或多级编号前分割
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)',  # 在中文数字加顿号（如一、二、）前分割
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								        item_with_placeholders
 								    )
-.12 工程标适配

											
										
										
											2024-11-12 17:12:47 +08:00
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    # 3. 还原括号内容
 								    split_sentences = [re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], s) for s in
 								                       split_sentences]
 								    # 4. 过滤空字符串
 								    split_sentences = [s for s in split_sentences if s.strip()]
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								    # print(split_sentences)
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
+								    return split_sentences
-.16 无效标终极版，解决了顺序问题、跨页表格显示不全、段落+表格联动

											
										
										
											2025-01-16 17:22:49 +08:00
+								# 文件预处理----按文件顺序提取文本和表格,并合并跨页表格
-.15 废标+截取pdf

											
										
										
											2025-01-15 15:45:25 +08:00
def extract_file_elements(file_path):
    """Read a .docx file into one ordered list of paragraphs and table text.

    Paragraphs are appended as paragraph objects.  Each table contributes the
    marker ``'[$$table_start$$]'``, then the fragments produced by
    ``split_cell_text`` for every cell longer than 8 characters (the first row
    is treated as the header and skipped), and the marker
    ``'[$$table_over$$]'`` once the table is finished.  Consecutive tables that
    are separated only by empty paragraphs or page markers are treated as one
    table continued across a page break.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        list: Paragraph objects and strings in document order.
    """
    table_start = '[$$table_start$$]'
    table_over = '[$$table_over$$]'
    # A fragment ending with one of these is complete and is not glued to the
    # first fragment of a continuation table.
    sentence_endings = ('。', '!', '?', ';', '！', '？', '；')

    doc = Document(file_path)
    doc_elements = doc.element.body
    doc_paragraphs = doc.paragraphs
    doc_tables = doc.tables

    # First header cell of the table currently open; None means "not inside a
    # table".  Compared with `is None` so an empty header cell ('') still
    # counts as an open table.
    pre_table_head = None
    table_combine = False
    paragraph_index = 0
    tables_index = 0
    doc_contents = []
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    for element in doc_elements:
        if element.tag.endswith('}p'):
            paragraph = doc_paragraphs[paragraph_index]
            paragraph_index += 1
            if pre_table_head is not None:
                text = paragraph.text
                # Empty paragraphs and page markers right after a table are
                # dropped: the table may continue on the next page.
                if text == '' or pattern_marker.search(text):
                    continue
                # Real text: the table is finished.
                doc_contents.append(table_over)
                table_combine = False
                pre_table_head = None
            doc_contents.append(paragraph)

        elif element.tag.endswith('}tbl'):
            table = doc_tables[tables_index]
            tables_index += 1
            table_content = []
            for row_idx, row in enumerate(table.rows):
                if row_idx == 0:
                    if pre_table_head is not None:
                        # Continuation of the previous table; skip the row
                        # only when it repeats the original header.
                        table_combine = True
                        if pre_table_head == row.cells[0].text:
                            continue
                    else:
                        # New table: remember its header and skip the row.
                        pre_table_head = row.cells[0].text
                        doc_contents.append(table_start)
                        continue
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    if len(cell_text) > 8:
                        table_content += split_cell_text(cell_text)

            # Join a sentence that was cut by the page break.  Guarded so that
            # an empty continuation table cannot raise and the start marker is
            # never modified.
            if (table_combine and table_content and doc_contents
                    and isinstance(doc_contents[-1], str)
                    and doc_contents[-1] != table_start
                    and not doc_contents[-1].endswith(sentence_endings)):
                doc_contents[-1] += ' ' + table_content.pop(0)
            doc_contents.extend(table_content)

    # A document that ends inside a table still gets its closing marker, so
    # every start marker has a matching end marker.
    if pre_table_head is not None:
        doc_contents.append(table_over)
    return doc_contents
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
 								def handle_query(file_path, user_query, output_file, result_key, keywords):
 								    """Collect clauses matching ``keywords`` and let the LLM pick the relevant ones.
 								
 								    The document is read with ``extract_file_elements``, normalised with
 								    ``preprocess_paragraphs`` and searched with ``extract_text_with_keywords``.
 								    ``clean_dict_datas`` then splits the hits into two dicts keyed by a shared
 								    position index: candidates that still need LLM screening and entries that are
 								    kept unconditionally. The candidates are written to ``output_file`` as a
 								    numbered list, the LLM returns the numbers to keep, and the kept candidates
 								    are merged with the unconditional entries in index order.
 								
 								    Args:
 								        file_path: Path of the document handed to ``extract_file_elements``.
 								        user_query: Prompt template; completed by ``generate_full_user_query``
 								            using the text written to ``output_file``.
 								        output_file: Path of the temporary text file holding the numbered candidates.
 								        result_key: Key of the returned dict; also used in log messages.
 								        keywords: Regex (string) that selects the clauses of interest.
 								
 								    Returns:
 								        ``{result_key: [text, ...]}``. When nothing is found, or any step raises,
 								        the list contains a single "not parsed" placeholder message.
 								    """
 								    try:
 								        # Hits containing any of these substrings are discarded by clean_dict_datas.
 								        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证：", "我方"]
 								        follow_up_keywords = [
 								            r'情\s*形\s*之\s*一',
 								            r'情\s*况\s*之\s*一',
 								            r'下\s*列(?!\s*公式)',  # negative lookahead: do not match "下列公式"
 								            r'以\s*下(?!\s*公式)',  # negative lookahead: do not match "以下公式"
 								            r'其\s*他.*?情\s*形\s*[:：]',
 								            r'包\s*括'
 								        ]
 								        doc_contents = extract_file_elements(file_path)
 								        processed_paragraphs = preprocess_paragraphs(doc_contents)
 								        extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
 								        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)
 								        # Candidates for LLM screening, ordered by their position index.
 								        all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])
 								        # De-duplicate on the first 25 characters BEFORE numbering. The numbers
 								        # written to the file and the list indexed with the model's answer must be
 								        # the same sequence; numbering after skipping duplicates while indexing the
 								        # full list would shift every selection made after the first duplicate.
 								        unique_items = []
 								        seen_contents = set()
 								        for global_idx, content in all_text1_items:
 								            key = content[:25]
 								            if key not in seen_contents:
 								                seen_contents.add(key)
 								                unique_items.append((global_idx, content))
 								        selected_contents = {}
 								        final_list = [f"未解析到'{result_key}'！"]
 								        # Proceed only if there is content to report.
 								        if unique_items or all_texts2:
 								            with open(output_file, 'w', encoding='utf-8') as file:
 								                for counter, (_, content) in enumerate(unique_items, start=1):
 								                    file.write(f"{counter}. {content}\n")
 								                    file.write("..............." + '\n')
 								            if not unique_items:
 								                # Nothing to screen, so skip the model call entirely.
 								                num_list = []
 								            else:
 								                user_query = generate_full_user_query(output_file, user_query)
 								                model_ans = qianwen_plus(user_query)  # reply of the qianwen_plus model
 								                # file_id = upload_file(output_file)
 								                # model_ans = qianwen_long(file_id, user_query)
 								                num_list = process_string_list(model_ans)  # numbers chosen by the model
 								            print(result_key + "选中的序号:" + str(num_list))
 								            for index in num_list:
 								                # Ignore numbers outside the list that was actually shown to the model.
 								                if 1 <= index <= len(unique_items):
 								                    original_global_idx, content = unique_items[index - 1]
 								                    selected_contents[original_global_idx] = content
 								            # Merge the selected candidates with the unconditional entries; on an
 								            # index clash the unconditional entry wins, as it is applied last.
 								            merged_dict = dict(selected_contents)
 								            merged_dict.update(all_texts2)
 								            final_list = [txt for _, txt in sorted(merged_dict.items(), key=lambda x: x[0])]
 								        return {result_key: final_list}
 								    except Exception as e:
 								        # Best-effort boundary: one failed query must not abort the other queries.
 								        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
 								        return {result_key: [f"未解析到'{result_key}'！"]}
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								def combine_find_invalid(invalid_docpath, output_dir):
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								    os.makedirs(output_dir, exist_ok=True)
 								    queries = [
 								        (
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								            r'否\s*决|'
 								            r'无\s*效\s*投\s*标|'
 								            r'无\s*效\s*文\s*件|'
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								            r'(?:文\s*件|投\s*标|响\s*应)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|'
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								            r'无\s*效\s*响\s*应|'
 								            r'无\s*效\s*报\s*价|'
 								            r'无\s*效\s*标|'
 								            r'视\s*为\s*无\s*效|'
 								            r'被\s*拒\s*绝|'
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								            r'将\s*拒\s*绝|'
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								            r'予\s*以\s*拒\s*绝',
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								            """以下是从招标文件中摘取的内容，文本中序号分明，各信息之间以...............分割。
 								任务目标：
 								从文本中筛选所有描述否决投标，拒绝投标，投标、响应无效或类似表述的情况，并返回对应的序号。
-.23 无效标废标更新

											
										
										
											2024-12-23 17:29:53 +08:00
+								要求与指南：
 								    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								输出格式：
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								    以 [x, x, x] 的形式返回，x 为符合条件的信息的序号，为自然数。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								    如果文本中没有符合条件的信息，请返回 []。
 								特殊情况：
 								    如果某序号的内容明显分为几部分且一部分内容符合筛选条件，但其他部分明显是无关内容，请返回符合部分的字符串内容代替序号。
 								示例输出,仅供格式参考：
 								    [1,3,4,6]
-.23 无效标废标更新

											
										
										
											2024-12-23 17:29:53 +08:00
+								文本内容：{full_text}
-.17 修复了提取货物标的Bug，无效投标bug修复

											
										
										
											2024-11-17 17:27:05 +08:00
+								            """,
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								            os.path.join(output_dir, "temp1.txt"),
 								            "否决和无效投标情形"
 								        ),
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								        (
 								            r'废\s*标',
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								            """以下是从招标文件中摘取的内容，文本中序号分明，文本内之间的信息以'...............'分割。
 								任务目标：
 								请根据以下内容，筛选出 废标项的情况 （明确描述导致 废标 的情况）并返回对应的序号。
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								要求与指南：
 								    文本中可能存在无关的信息，请准确筛选符合条件的信息，并将符合条件的信息的序号返回。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								输出格式：
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的信息的序号，为自然数。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								    如果文本中没有任何符合条件的废标情况，请返回 []。
 								示例输出,仅供格式参考：
 								    [1,3,4,6]
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								文本内容：{full_text}
 								            """,
 								            os.path.join(output_dir, "temp2.txt"),
 								            "废标项"
 								        ),
 								        (
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								            r'不\s*得(?!\s*(分|力))|禁\s*止\s*投\s*标',
 								            """以下是从招标文件中摘取的内容，文本中序号分明，文本内的条款以'...............'分割。条款规定了各方不得存在的情形。请根据以下要求进行筛选：
 								**投标相关主体与非投标相关主体的定义**：
 								    投标相关主体：包括但不限于“投标人”、“中标人”、“供应商”、“联合体投标各方”、“响应人”、“应答人”或其他描述投标方的词语。
 								    非投标相关主体：包括但不限于“招标人”、“采购人”、“评标委员会”或其他描述非投标方的词语。
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								**筛选要求**：
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+. **仅筛选**明确描述投标相关主体禁止情形或不得存在的情形的条款，不包含笼统或未具体说明情形的条款。例如：
 								    若条款内容包含'投标人不得存在的其他关联情形'这样的笼统描述，而未说明具体的情形，则无需添加该条款。
 . **排除**仅描述非投标相关主体行为限制或禁止情形的条款，例如“招标人不得泄露信息”或“评标委员会不得收受贿赂”，则无需返回。
 . 若条款同时描述了对投标相关主体与非投标相关主体的行为限制、禁止情形，也需返回。
 . **特殊情况**：如果条款中包含“磋商小组”、”各方“等既能指代投标相关主体又能指代非投标相关主体的词汇：
 								    若在语境中其指代或包含投标相关主体，则应将其考虑在内；否则，排除该条款。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
 								**输出格式**：
-.7 商务要求标题带星保留+评分项手动计算总分

											
										
										
											2025-01-07 17:35:11 +08:00
+								    返回结果以 [x, x, x] 的形式，其中 x 为符合条件的条款的序号，为自然数。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								    如果没有符合条件的条款，返回 `[]`。
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								**示例**：
 								- **符合条件**：
 								  - `1. 投标人不得...` → 包含，返回序号 1。
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								  - `3. 联合体投标各方不得...` → 包含，返回序号 3。
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
+								- **不符合条件**：
-.2 修改bug

											
										
										
											2025-01-02 11:28:38 +08:00
+								  - `2. 采购人不得...` → 主语为“采购人”，排除。
 								-示例输出: [1,3]
 								请根据上述筛选要求，阅读以下文本内容，并返回符合条件的条款序号，
-.24 禅道bug修改

											
										
										
											2024-12-24 10:21:57 +08:00
 								文本内容：{full_text}
 								            """,
 								            os.path.join(output_dir, "temp3.txt"),
 								            "不得存在的情形"
 								        )
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								    ]
 								    results = []
 								    # 使用线程池来并行处理查询
 								    with ThreadPoolExecutor() as executor:
 								        futures = []
 								        for keywords, user_query, output_file, result_key in queries:
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								            future = executor.submit(handle_query, invalid_docpath, user_query, output_file, result_key, keywords)
-.13 无效投标

											
										
										
											2024-11-13 11:46:13 +08:00
+								            futures.append((future, result_key))  # 保持顺序
 								            time.sleep(0.5)  # 暂停0.5秒后再提交下一个任务
 								        for future, result_key in futures:
 								            try:
 								                result = future.result()
 								            except Exception as e:
 								                print(f"线程处理 {result_key} 时出错: {e}")
 								                result = {result_key: ""}
 								            results.append(result)
 								    combined_dict = {}
 								    for d in results:
 								        combined_dict.update(d)
 								    print("无效标与废标done...")
 								    return {"无效标与废标项": combined_dict}
 								if __name__ == '__main__':
 								    # Manual smoke run against a local sample document (developer machine paths).
 								    run_started = time.time()
 								    # Sample inputs used in earlier runs, kept for reference:
 								    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
 								    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件（实高电子显示屏）_tobidders_notice_part1.docx"
 								    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
 								    # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
 								    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
 								    sample_pdf = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf'
 								    target_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
 								    # Disabled pre-processing steps (page marks, then PDF -> DOCX):
 								    # invalid_added = insert_mark(pdf_path)
 								    # # invalid_added_docx = pdf2docx(invalid_added)
 								    sample_docx = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
 								    found = combine_find_invalid(sample_docx, target_dir)
 								    run_finished = time.time()
 								    rendered = json.dumps(found, ensure_ascii=False, indent=4)
 								    print("Results:", rendered)
 								    # # print("Elapsed time:", str(end_time - start_time))