# zbparse/flask_app/main/禁止投标情形.py

import ast
import json
import os
import re

from PyPDF2 import PdfWriter, PdfReader

from flask_app.main.通义千问long import upload_file, qianwen_long

def extract_and_format_from_paths(json_paths, includes):
    """Collect keyword-bearing text from several JSON files.

    Every file is expected to contain a JSON object. For each top-level
    entry:

    * a string value is kept as-is when it contains any keyword;
    * a dict value is scanned one level deep, and every string sub-value
      that contains a keyword is kept as ``"<sub_key>: <sub_value>"``.

    Values of any other type (numbers, booleans, null, lists) are skipped
    because a substring search is not meaningful for them.

    Files that are missing, are not valid JSON, or do not hold a JSON
    object are reported on stdout and skipped (best effort).

    Args:
        json_paths (list): Paths of the JSON files to read.
        includes (list): Keywords to look for.

    Returns:
        list: Matching strings from all files, in file order and then in
        the order the entries appear inside each file.
    """
    def _mentions_keyword(text):
        return any(include in text for include in includes)

    all_formatted_results = []

    for path in json_paths:
        # Keep the try body down to the statements that can actually raise
        # the two handled errors.
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")
            continue

        # A top-level list/scalar has no key/value pairs to walk.
        if not isinstance(json_data, dict):
            print(f"Error: The file '{path}' does not contain a JSON object.")
            continue

        for value in json_data.values():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    # Guard against non-string sub-values: `in` on an int
                    # or None would raise TypeError.
                    if isinstance(sub_value, str) and _mentions_keyword(sub_value):
                        all_formatted_results.append(f"{sub_key}: {sub_value}")
            elif isinstance(value, str) and _mentions_keyword(value):
                all_formatted_results.append(value)

    return all_formatted_results

def extract_unique_items_from_texts(texts):
    """Split numbered clauses out of each text and return them de-duplicated.

    For every text, the lead-in up to the last colon on its first line is
    dropped, the remainder is split on list markers such as ``1.``,
    ``（1）``, ``(1)`` or ``1)``, and each piece has surrounding whitespace
    and trailing punctuation removed.

    Args:
        texts: Iterable of strings to process.

    Returns:
        list: The cleaned, non-empty pieces in first-seen order, without
        duplicates.
    """
    numbering_re = re.compile(r'(?:\d+\.|\（\d+\）|\(\d+\)|\d+\))\s*')
    lead_in_re = re.compile(r'^.*[:：]')
    trailing_punct_re = re.compile(r'[；。，、．.,:;!?！？]+$')

    def _normalize(fragment):
        # Returns '' for fragments that carry no content.
        fragment = fragment.strip()
        if not fragment:
            return ''
        fragment = numbering_re.sub('', fragment)
        fragment = trailing_punct_re.sub('', fragment)
        return fragment.strip()

    # A dict keeps insertion order, so it doubles as an ordered set.
    ordered_unique = {}
    for raw_text in texts:
        body = lead_in_re.sub('', raw_text)
        for fragment in numbering_re.split(body):
            item = _normalize(fragment)
            if item:
                ordered_unique.setdefault(item, None)

    return list(ordered_unique)

def merge_pdfs(paths, output_filename):
    """Concatenate the pages of several PDFs into a single file.

    The merged file is written into the directory of the first entry of
    ``paths`` under the name ``output_filename``.

    Args:
        paths: PDF file paths, merged in the given order.
        output_filename: File name (not a full path) for the merged PDF.

    Returns:
        The path of the merged file, or None when ``paths`` is empty, in
        which case nothing is written.
    """
    writer = PdfWriter()
    destination = None

    for source_path in paths:
        # Append every page of this input to the writer.
        for page in PdfReader(source_path).pages:
            writer.add_page(page)

        # The first input decides where the merged file is stored.
        if destination is None:
            destination = os.path.join(os.path.dirname(source_path), output_filename)

    if not destination:
        print("No files to merge.")
        return destination

    with open(destination, 'wb') as out:
        writer.write(out)
    print(f"Merged PDF saved to {destination}")
    return destination

def _strip_wrapping_quotes(item):
    """Remove one pair of matching quotes that wraps the whole item, if any."""
    quote_pairs = {'"': '"', "'": "'", '“': '”', '‘': '’'}
    if len(item) >= 2 and quote_pairs.get(item[0]) == item[-1]:
        return item[1:-1].strip()
    return item


def process_string_list(string_list):
    """Extract the first ``[...]`` list found in a text and return it as a list.

    Intended for free-form replies that embed a list, e.g. ``答案：[甲, 乙]``.

    Parsing rules:

    * A well-formed list of quoted strings (``["a", "b"]`` or
      ``['a', 'b']``) is parsed as a literal, so commas inside quotes are
      preserved.
    * Otherwise the bracket content is split on ``,``. If every piece is a
      decimal number the result is a list of ints; if not, it is a list of
      strings with one pair of wrapping quotes removed from each piece.
    * Empty pieces are dropped.

    Args:
        string_list: Text that may contain a bracketed list. Non-string
            input is treated as "no list".

    Returns:
        list: The parsed items, or ``[]`` when no non-empty list is found.
    """
    if not isinstance(string_list, str):
        return []

    # DOTALL lets the list span several lines, which is common in replies
    # that put one item per line.
    match = re.search(r'\[(.*?)\]', string_list, re.DOTALL)
    if not match:
        return []

    content_inside_brackets = match.group(1).strip()
    if not content_inside_brackets:
        return []

    # First try the text as a real literal. literal_eval only evaluates
    # literals, so it is safe on untrusted text.
    try:
        parsed = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, list) and parsed and all(isinstance(entry, str) for entry in parsed):
        return [entry.strip() for entry in parsed if entry.strip()]

    # Fallback for unquoted or irregular lists: split manually instead of
    # rebuilding source code, which broke on items containing quotes.
    pieces = [piece.strip() for piece in content_inside_brackets.split(',')]
    pieces = [piece for piece in pieces if piece]

    # isdecimal() only accepts characters int() can convert.
    if pieces and all(piece.isdecimal() for piece in pieces):
        return [int(piece) for piece in pieces]

    unquoted = (_strip_wrapping_quotes(piece) for piece in pieces)
    return [piece for piece in unquoted if piece]
def find_forbidden(truncate_json_path, clause_path, truncate3):
    """Build the list of situations a bidder must not be in.

    Two sources are combined:

    1. ``truncate3`` is uploaded with ``upload_file`` and ``qianwen_long``
       is asked for the forbidden situations; the reply is parsed with
       ``process_string_list``.
    2. The two JSON files are searched for entries containing "不得存在" or
       "禁止投标", and the hits are split into individual items.

    NOTE: merging the scoring table PDF into the query (via ``merge_pdfs``)
    was dropped for now; per the original author's note that content is
    already excerpted elsewhere under '否决投标'.

    Args:
        truncate_json_path: Path of a JSON file to search for keywords
            (presumably the bidder-instructions table extract; verify
            against the caller).
        clause_path: Path of a second JSON file to search (presumably the
            clause extract; verify against the caller).
        truncate3: Path of the file that is uploaded for the model query.

    Returns:
        dict: ``{'不得存在的其他情形': [...]}`` with model items first, then
        keyword items, duplicates removed in first-seen order.
    """
    user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些，请以列表给我提供信息，形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及，返回[]。"
    model_reply = qianwen_long(upload_file(truncate3), user_query_forbidden)
    model_items = process_string_list(model_reply)

    keyword_hits = extract_and_format_from_paths(
        [truncate_json_path, clause_path],
        ["不得存在", "禁止投标"],
    )
    clause_items = extract_unique_items_from_texts(keyword_hits)

    # An insertion-ordered dict removes duplicates while keeping order.
    deduplicated = {}
    for entry in model_items + clause_items:
        deduplicated.setdefault(entry, None)

    return {'不得存在的其他情形': list(deduplicated)}


if __name__ == '__main__':
    # Manual run against one locally extracted tender package; the paths
    # below are specific to the original developer's machine.
    # NOTE(review): output_dir and doc_path are assigned but not used here.
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'

    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"

    # The qualification PDF is what gets uploaded for the model query.
    find_forbidden(
        truncate_json_path=truncate_json_path,
        clause_path=clause_path,
        truncate3=truncate4,
    )