# zbparse/flask_app/main/无效标和废标和禁止投标整合.py

# -*- coding: utf-8 -*-
import json
import os.path
import time
import re
from json_utils import combine_json_results, nest_json_under_key
from 通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from 禁止投标情形 import find_forbidden


# If the current paragraph carries a section number, collect forward until a heading
# with the same numbering style appears; if it carries none, look ahead for the first
# numbered paragraph and collect every sibling that shares its numbering pattern.
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    from collections import OrderedDict
    from docx import Document
    import re

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()  # OrderedDict keeps document order
    continue_collecting = False
    current_section_pattern = None
    active_key = None  # key of the text block currently being collected

    def match_keywords(text, patterns):
        """Return True if any of the regex patterns matches the text."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":  # skip empty lines
            return
        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None  # a same-level heading ends the current collection
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
        if match_keywords(text, keywords):
            active_key = text  # the matching paragraph becomes the block key
            extracted_paragraphs[active_key] = [text]  # the list starts with the match itself
            # does the paragraph also announce an enumeration ("之一", "下列", ...)?
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # derive a pattern for the heading level of the current paragraph
                section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')
                    # e.g. "7.2.1" (two dots) yields r'\b\d+\s*\.\s*\d+\s*\.\s*\d+\b'
                    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
                    current_section_pattern = re.compile(pattern)
                else:
                    # no number on this paragraph: look ahead for the first numbered
                    # paragraph and collect every sibling that shares its pattern
                    found_next_number = False
                    current_section_pattern = None
                    for next_index in range(index + 1, len(doc.paragraphs)):
                        next_text = doc.paragraphs[next_index].text.strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                section_parts = next_section_number.group(1).split('.')
                                dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)  # keep collecting siblings

    for index, para in enumerate(doc.paragraphs):
        extract_from_text(para.text.strip(), index)
    return extracted_paragraphs  # dict: keyword paragraph -> list of related paragraphs
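
# A minimal sketch (illustrative only, not called anywhere in this module) of the
# numbering heuristic above: a heading such as "7.2.1" carries two dots, so the
# derived pattern matches any heading of the same depth and ends the collection there.
def _demo_section_pattern():
    text = "7.2.1 有下列情形之一的,投标无效:"
    section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
    level_count = section_number.group(1).count('.')  # 2 -> a three-level heading
    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
    assert re.match(pattern, "7.2.2 下一条")        # same-depth sibling: collection stops
    assert not re.match(pattern, "7.2 上一级标题")  # shallower heading: keep collecting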
def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalize the keyword-extracted blocks into clean text."""
    all_texts1 = []
    all_texts2 = []
    # sentence-splitting pattern covering Chinese and Western terminal punctuation
    split_pattern = r'(?<=[。!?\!\?])'
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # drop entries that contain any excluded string
                if any(exclude in data for exclude in excludes):
                    continue
                # strip the leading numbering: "(1)", "A1.2", "3、", ...
                pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # from the keyword position, keep only up to the next terminal punctuation
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        cleaned_text = data[:start_pos] + sentences[0]  # first sentence, punctuation kept
                    else:
                        cleaned_text = data  # no terminal punctuation: keep the whole string
                else:
                    cleaned_text = data  # keyword not found: keep the original text
                all_texts1.append(cleaned_text)
        else:
            # multi-paragraph block: strip the numbering off the first line, then join
            pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
            data = re.sub(pattern, '', text_list[0]).strip()
            text_list[0] = data
            joined_text = "\n".join(text_list)
            all_texts2.append(joined_text)
    return all_texts1, all_texts2
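
# Illustrative only: how the leading-numbering regex used in clean_dict_datas behaves
# on a few assumed heading shapes (made-up strings, not from a real tender file).
def _demo_strip_numbering():
    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
    for raw in ("(1)投标文件未密封的", "3.2.1 投标无效", "A1、未按要求盖章的"):
        print(re.sub(pattern, '', raw).strip())
    # -> 投标文件未密封的 / 投标无效 / 未按要求盖章的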
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively collect sentences containing the keywords; sentences that also
    carry a follow-up cue go into the second list, the rest into the first."""
    sentences1 = []  # without a follow-up keyword
    sentences2 = []  # with a follow-up keyword
    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # split into sentences, keeping the terminal punctuation
        split_sentences = re.split(r'(?<=[。!?\!\?])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i]
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # the sentence announces an enumeration: capture up to the next numbered section
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break
                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', full_text)
                    sentences2.append(data)  # enumeration case
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', sentence).replace('\n', '').strip()
                    sentences1.append(data)  # single-sentence case
                    i += 1
            else:
                i += 1
    return sentences1, sentences2
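
# A small hypothetical example of the recursive walk above: keyword sentences inside
# nested dict/list values are found, and a sentence that announces an enumeration
# ("…之一") lands in the second list rather than the first.
def _demo_find_sentences():
    data = {"第7条": ["7.1 投标文件有下列情形之一的,投标无效:未按要求密封。"],
            "第8条": "8.1 逾期送达的投标文件,招标人将予以拒绝。"}
    s1, s2 = find_sentences_with_keywords(data, r'无\s*效|拒\s*绝', [r'之\s*一'])
    print(s1)  # ['逾期送达的投标文件,招标人将予以拒绝。']
    print(s2)  # ['投标文件有下列情形之一的,投标无效:未按要求密封。']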
def extract_sentences_from_json(json_path, keywords, follow_up_keywords):
    """Extract keyword sentences from a JSON file."""
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return find_sentences_with_keywords(data, keywords, follow_up_keywords)
# invalid-bid handling
def extract_values_if_contains(data, includes):
    """
    Recursively check whether values in the data contain any string from 'includes'.
    Matching values are collected into a list and returned.

    Args:
        data (dict): dict or JSON-parsed data.
        includes (list): keywords to look for.

    Returns:
        list: values that contain at least one keyword.
    """
    included_values = []  # result list

    # recursive helper for nested dicts and lists
    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                if isinstance(value, dict):
                    # dict value: recurse into it
                    recursive_search(value)
                elif isinstance(value, list):
                    # list value: recurse into each element
                    recursive_search(value)
                elif isinstance(value, str):
                    # string value: check it against the keyword list
                    if any(include in value for include in includes):
                        included_values.append(value)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item)

    recursive_search(data)
    return included_values
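
# Hypothetical usage of extract_values_if_contains with made-up data: only string
# values containing one of the keywords survive the walk.
def _demo_extract_values():
    data = {"a": {"b": "投标文件逾期送达的,予以拒绝"}, "c": ["无关内容"]}
    print(extract_values_if_contains(data, ["拒绝"]))  # ['投标文件逾期送达的,予以拒绝']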
# Earlier prompt drafts, kept for reference:
# 你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
# 以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。
# 以上是原文内容,文本内的信息以'...............'分割请你根据该信息回答否决投标或拒绝投标或无效投标或使投标失效的情况有哪些文本中可能存在无关的信息请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
    follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
    extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # OrderedDict
    all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
    all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
    qianwen_txt = all_texts1 + all_tables1
    # proceed only if there is content to write
    if qianwen_txt:
        with open(output_file, 'w', encoding='utf-8') as file:
            counter = 1
            for content in qianwen_txt:
                file.write("..............." + '\n')
                # prefix each block with its ordinal so the model can cite it back
                file.write(f"{counter}. {content}\n")
                counter += 1
        file_id = upload_file(output_file)
        print("starting qianwen-long...")
        qianwen_ans = qianwen_long(file_id, user_query)
        selected_contents = []
        num_list = json.loads(qianwen_ans)  # expects a bare JSON list such as [1, 3, 5]
        print(num_list)
        for index in num_list:
            if index - 1 < len(qianwen_txt):
                content = qianwen_txt[index - 1]  # ordinals start at 1
                selected_contents.append(content)
        selected_contents += all_texts2
        selected_contents += all_tables2
        res = {result_key: selected_contents}
        # os.remove(output_file)  # remove the temp file after use
        # print(f"Deleted temporary file: {output_file}")
    else:
        res = {result_key: ""}  # empty response when nothing was extracted
    return res
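
# json.loads above assumes the model answers with a bare list such as "[1, 3, 5]".
# A more defensive variant (a sketch, not used by handle_query) could pull the first
# bracketed group out of a chattier answer before parsing:
def _parse_index_list(answer):
    match = re.search(r'\[[\d,\s]*\]', answer)
    return json.loads(match.group(0)) if match else []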
def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate4):
    print("starting无效标与废标...")
    queries = [
        (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
        (r'废\s*标',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp2.txt"), "废标项")
    ]
    results = []
    # run the two keyword queries in parallel
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,
                                     truncate_json_path)
            futures.append(future)
            time.sleep(1)  # stagger submissions by one second
        for future in futures:
            results.append(future.result())
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4)
    results.append(forbidden_res)
    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    print("无效标与废标done...")
    return nest_json_under_key(combined_dict, "无效标与废标项")
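
# Rough shape of the combined result (the first two sub-keys come from the queries
# above; the forbidden-bid key depends on what find_forbidden in 禁止投标情形 returns):
# {"无效标与废标项": {"否决和无效投标情形": [...], "废标项": [...], ...}}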
# TODO: 1. end-to-end runtime is about 80 s; optimize the threading if this becomes the bottleneck
#       2. tables in the front sheet of the bid-evaluation rules are not extracted yet
#       3. table extraction currently splits on the Chinese full stop
#       4. qianwen-long still has a known bug
if __name__ == '__main__':
    start_time = time.time()
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
    results = combine_find_invalid(doc_path, output_dir, truncate_json_path, clause_path, truncate4)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", results)