# zbparse/flask_app/main/禁止投标情形.py
# 2024-08-29 16:37:09 +08:00
#
# 151 lines
# 6.6 KiB
# Python
# (repository viewer links: Raw / Blame / History)
#
# NOTE: the repository viewer flagged this file as containing ambiguous
# Unicode characters, i.e. characters that might be confused with other
# characters (for example full-width punctuation that resembles ASCII).
# If their use is intentional, the warning can safely be ignored.

import ast
import json
import os
import re
from PyPDF2 import PdfWriter, PdfReader
from 通义千问long import upload_file, qianwen_long
from json_utils import clean_json_string
def extract_and_format_from_paths(json_paths, includes):
    """Collect keyword-bearing text from several JSON files.

    Every file is expected to hold a JSON object. For each top-level entry:

    * a string value is kept verbatim when it contains any keyword;
    * a dict value is scanned one level deep, and every string sub-value
      that contains a keyword is kept as ``"<sub_key>: <sub_value>"``.

    Values of any other type (numbers, lists, ``null`` ...) are ignored
    rather than tested, because the keyword check is a substring test that
    is only meaningful for strings. Files that are missing, contain invalid
    JSON, or whose root is not an object are reported on stdout and skipped;
    processing continues with the next path.

    Args:
        json_paths (list): Paths of the JSON files to read, in order.
        includes (list): Keywords to look for (substring match).

    Returns:
        list: Matching strings from all files, in path order and then in
        the key order of each file.
    """
    def contains_keyword(text):
        # Guard on str: `kw in 5` raises TypeError, and `kw in [...]` or
        # `kw in {...}` would silently test membership instead of substring.
        return isinstance(text, str) and any(kw in text for kw in includes)

    all_formatted_results = []
    for path in json_paths:
        # Keep the try body limited to the operations that can actually
        # raise the handled exceptions.
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")
            continue

        if not isinstance(json_data, dict):
            # A JSON array or scalar has no key/value pairs to scan.
            print(f"Error: The file '{path}' does not contain a JSON object.")
            continue

        for value in json_data.values():
            if isinstance(value, dict):
                # Nested dict: report each matching entry with its own key.
                all_formatted_results.extend(
                    f"{sub_key}: {sub_value}"
                    for sub_key, sub_value in value.items()
                    if contains_keyword(sub_value)
                )
            elif contains_keyword(value):
                all_formatted_results.append(value)
    return all_formatted_results
def extract_unique_items_from_texts(texts):
    """Split enumerated clauses into individual, de-duplicated items.

    For each text, everything up to and including the last colon on the
    first line is treated as an introduction and removed (the match is
    greedy). The remainder is split on enumeration markers, and each piece
    is trimmed of surrounding whitespace and trailing punctuation.

    Recognized markers are ``1.``, ``(1)`` and ``1)``; parentheses, the
    colon and the trailing punctuation are accepted in both their ASCII and
    full-width forms. Full-width characters are written as ``\\uXXXX``
    escapes so they cannot be confused with (or normalized into) their
    ASCII look-alikes.

    Args:
        texts: Iterable of strings to split.

    Returns:
        list: Unique non-empty items in order of first appearance across
        all texts.
    """
    # The fully parenthesized form must come before the bare "1)" form;
    # otherwise "(1)" is split on "1)" and a stray "(" is left behind.
    # \uff08 / \uff09 are the full-width parentheses.
    marker_re = re.compile(r'(?:\d+\.|[(\uff08]\d+[)\uff09]|\d+[)\uff09])\s*')
    # \uff1a is the full-width colon.
    intro_re = re.compile(r'^.*[:\uff1a]')
    # ASCII ; , . : ! ? with their full-width twins, plus the ideographic
    # full stop (\u3002) and the ideographic comma (\u3001).
    trailing_punct_re = re.compile(
        r'[;\uff1b\u3002,\uff0c\u3001.\uff0e:\uff1a!\uff01?\uff1f]+$'
    )

    unique_items = []
    seen = set()
    for text in texts:
        body = intro_re.sub('', text)
        # split() already removes every marker, so the pieces need no
        # further marker substitution.
        for piece in marker_re.split(body):
            item = trailing_punct_re.sub('', piece.strip()).strip()
            if item and item not in seen:
                seen.add(item)
                unique_items.append(item)
    return unique_items
def merge_pdfs(paths, output_filename):
    """Concatenate the pages of several PDFs into one file.

    The merged file is written next to the first input PDF, under the name
    ``output_filename``.

    Args:
        paths: PDF file paths, merged in the given order.
        output_filename: File name (not a full path) of the merged PDF.

    Returns:
        The path of the written file, or the falsy placeholder (``None``
        when ``paths`` is empty) if nothing was written.
    """
    writer = PdfWriter()
    output_path = None
    for pdf_path in paths:
        # Append every page of the current document to the writer.
        for page in PdfReader(pdf_path).pages:
            writer.add_page(page)
        # The directory of the first input decides where the result goes.
        if output_path is None:
            output_path = os.path.join(os.path.dirname(pdf_path), output_filename)

    if not output_path:
        print("No files to merge.")
        return output_path

    # Persist the merged document.
    with open(output_path, 'wb') as out:
        writer.write(out)
    print(f"Merged PDF saved to {output_path}")
    return output_path
def process_string_list(string_list):
    """Parse the first ``[...]`` group of a text into a list of strings.

    The content between the first ``[`` and the next ``]`` is split on ASCII
    commas. Each item is stripped of surrounding whitespace and of one pair
    of matching surrounding quotes (``'`` or ``"``); empty items are
    dropped.

    The list is built directly instead of being re-assembled into Python
    source and evaluated: that round trip failed for the whole list as soon
    as a single item contained a quote character.

    Args:
        string_list: Text expected to contain a bracketed list, e.g.
            ``"[xx, yy]"``. Non-string input yields an empty list.

    Returns:
        list: The parsed items, or ``[]`` when no bracketed group is found
        or it is empty.
    """
    if not isinstance(string_list, str):
        return []

    # DOTALL lets the bracketed group span several lines.
    match = re.search(r'\[(.*?)\]', string_list, re.DOTALL)
    if not match:
        return []

    items = []
    for raw_item in match.group(1).split(','):
        item = raw_item.strip()
        # Drop one layer of matching quotes so "a" / 'a' / a all give 'a'.
        if len(item) >= 2 and item[0] == item[-1] and item[0] in "'\"":
            item = item[1:-1].strip()
        if item:
            items.append(item)
    return items
def find_forbidden(truncate_json_path, clause_path, truncate4):
    """Gather the "other situations a bidder must not be in".

    Two sources are combined:

    1. ``truncate4`` is uploaded with ``upload_file`` and queried through
       ``qianwen_long``; the bracketed list in the reply is parsed with
       ``process_string_list``.
    2. The JSON files at ``truncate_json_path`` and ``clause_path`` are
       scanned for entries containing the keywords below, and the hits are
       split into individual items.

    Intermediate results are printed for debugging.

    NOTE(review): per the original inline note the three inputs correspond
    to the bidder-instructions front table, the clauses, and the scoring
    front table / qualification review form - confirm against the caller.
    Merging an extra PDF into the upload via ``merge_pdfs`` was tried and is
    currently disabled; the original note says the scoring-table items are
    already extracted under '否决投标'.

    Returns:
        dict: ``{'不得存在的其他情形': [...]}`` with the model items first,
        followed by the keyword-based items, duplicates removed.
    """
    uploaded_id = upload_file(truncate4)
    # The query asks for a plain bracketed list and an empty list when the
    # document says nothing (an earlier JSON-style query was replaced).
    user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
    model_reply = qianwen_long(uploaded_id, user_query_forbidden)
    print(model_reply)
    model_items = process_string_list(model_reply)
    print(model_items)

    keyword_hits = extract_and_format_from_paths(
        [truncate_json_path, clause_path],
        ["不得存在", "禁止投标"],
    )
    rule_items = extract_unique_items_from_texts(keyword_hits)
    print(rule_items)

    # Dict keys keep insertion order, which gives an order-preserving
    # de-duplication of the concatenated lists.
    combined = list({entry: None for entry in model_items + rule_items})
    return {'不得存在的其他情形': combined}
if __name__ == '__main__':
    # Manual smoke run against sample files on a developer machine; the
    # paths below are local and must be adapted before running elsewhere.
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    # find_forbidden prints its intermediate results itself.
    find_forbidden(truncate_json_path, clause_path, truncate4)