zbparse/flask_app/main/禁止投标情形.py

189 lines
9.3 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
import ast
import json
import os
import re
from PyPDF2 import PdfWriter, PdfReader
2024-08-29 17:30:49 +08:00
from flask_app.main.通义千问long import upload_file, qianwen_long
2024-08-29 16:37:09 +08:00
def extract_and_format_from_paths(json_paths, includes, excludes):
    """Collect matching strings from several JSON files.

    Walks every file in *json_paths*.  Values that are nested dicts
    contribute their sub-values whenever a sub-key contains one of the
    *includes* keywords.  Plain string values contribute themselves when
    they contain an include keyword whose preceding text carries none of
    the *excludes* keywords; multi-line strings lose their first line
    (assumed to be a heading) before being kept.

    Args:
        json_paths (list): paths of the JSON files to scan.
        includes (list): keywords that select content.
        excludes (list): keywords that veto a string when they appear
            before the matched include keyword.

    Returns:
        list: all matching strings, in file / key order.
    """
    collected = []
    for json_path in json_paths:
        try:
            with open(json_path, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        except FileNotFoundError:
            print(f"禁止投标情形: Error: The file '{json_path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"禁止投标情形: Error: The file '{json_path}' contains invalid JSON.")
            continue
        for _, value in data.items():
            if isinstance(value, dict):
                # Nested dict: match the include keywords against sub-keys.
                collected.extend(
                    f"{sub_value}"
                    for sub_key, sub_value in value.items()
                    if any(word in sub_key for word in includes)
                )
            elif isinstance(value, str):  # clause
                for word in includes:
                    if word not in value:
                        continue
                    # Text before the keyword must be free of exclude words.
                    before = value.split(word)[0]
                    if any(veto in before for veto in excludes):
                        continue
                    if '\n' in value:
                        # Drop the heading line of a multi-line clause.
                        value = value.split('\n', 1)[-1]
                    collected.append(value)
                    break  # one matching keyword per clause is enough
    return collected
def extract_unique_items_from_texts(texts):
    """Split free-form clause texts into unique, cleaned list items.

    Each text is flattened (tabs/newlines removed), stripped of any
    leading intro up to the first colon, then split on common numbering
    markers ("1.", "(1)", "1)", circled digits ①–⑫).  URLs are shielded
    behind placeholders during cleanup and restored afterwards.  An item
    is kept when it is new, longer than three characters, and contains
    at least two consecutive word characters (CJK, latin or digit).

    Args:
        texts (list): raw clause strings.

    Returns:
        list: de-duplicated cleaned items, in first-seen order.
    """
    # Numbering markers, incl. Chinese circled digits.
    pattern = re.compile(r'(?:\d+\.|\\d+\|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
    intro_pattern = re.compile(r'^.*?[:]')
    punctuation_pattern = re.compile(r'[;。,、..,:;!?]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')  # >=2 consecutive word chars

    unique_items = []
    seen = set()
    for raw in texts:
        flat = raw.replace('\t', '').replace('\n', '')
        # Strip the leading intro (everything up to and incl. the colon).
        flat = intro_pattern.sub('', flat)
        # Shield URLs so the punctuation cleanup cannot mangle them.
        urls = []

        def shield(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"

        flat = url_pattern.sub(shield, flat)
        for piece in pattern.split(flat):
            piece = piece.strip()
            if not piece:
                continue
            # Remove residual numbering and trailing punctuation.
            piece = punctuation_pattern.sub('', pattern.sub('', piece)).strip()
            # Restore the shielded URLs.
            for idx, saved_url in enumerate(urls, 1):
                piece = piece.replace(f"{{URL{idx}}}", saved_url)
            if (piece and piece not in seen and len(piece) > 3
                    and content_check_pattern.search(piece)):
                seen.add(piece)
                unique_items.append(piece)
    return unique_items
def merge_pdfs(paths, output_filename):
    """Concatenate several PDF files into a single document.

    The merged PDF is written into the directory of the first input
    file, under *output_filename*.

    Args:
        paths (list): PDF file paths, merged in the given order.
        output_filename (str): file name for the merged result.

    Returns:
        str | None: path of the merged PDF, or None when *paths* is empty.
    """
    writer = PdfWriter()
    merged_path = None
    for pdf_path in paths:
        reader = PdfReader(pdf_path)
        # Copy every page of the current document into the writer.
        for page in reader.pages:
            writer.add_page(page)
        if merged_path is None:
            # The output lands next to the first input file.
            merged_path = os.path.join(os.path.dirname(pdf_path), output_filename)
    if merged_path:
        with open(merged_path, 'wb') as out:
            writer.write(out)
        print(f"禁止投标情形: Merged PDF saved to {merged_path}")
    else:
        print("禁止投标情形: No files to merge.")
    return merged_path
def process_string_list(string_list):
    """Parse a bracketed list out of a model-response string.

    Finds the first "[...]" span in *string_list* and converts its
    comma-separated content into a real Python list.  All-numeric items
    come back as ints; anything else comes back as strings.

    Args:
        string_list (str): text expected to contain "[item, item, ...]".

    Returns:
        list: the parsed items, or [] when no bracketed span is found,
            the span is empty, or the content cannot be parsed.
    """
    # Non-greedy: grab the first (innermost-looking) bracket pair.
    match = re.search(r'\[(.*?)\]', string_list)
    if not match:
        # No bracketed content at all.
        return []
    content_inside_brackets = match.group(1)
    # Fix: filter empty fragments BEFORE the numeric check, so a stray
    # trailing comma ("[1, 2,]") no longer derails it — the original
    # checked unfiltered fragments and fell into the string branch.
    items = [item.strip() for item in content_inside_brackets.split(',') if item.strip()]
    if not items:
        return []  # empty brackets, e.g. "[]"
    if all(item.isdigit() for item in items):
        # Pure numbers: leave them unquoted so they parse as ints.
        formatted_list = '[' + ', '.join(items) + ']'
    else:
        # Mixed content: quote each item as a string literal.
        formatted_list = '[' + ', '.join(f"'{item}'" for item in items) + ']'
    try:
        return ast.literal_eval(formatted_list)
    except (SyntaxError, ValueError) as e:
        # Fix: literal_eval raises ValueError on some malformed literals
        # (e.g. items containing stray quotes); the original caught only
        # SyntaxError and would propagate the ValueError.
        print(f"禁止投标情形: Error parsing list: {e}")
        return []
def find_forbidden(truncate_json_path,clause_path,truncate3):
    """Gather the "prohibited bidder situations" clauses of a tender file.

    Two sources are combined: an LLM query against the qualification PDF
    (*truncate3*; skipped when falsy) and keyword-based extraction from
    the local JSON files, de-duplicated while preserving order.
    Sources covered: bidder-notice front table, clauses, qualification
    review section (评分前附表 content is handled under '否决投标').

    Args:
        truncate_json_path (str): path to the truncated-output JSON.
        clause_path (str): path to the clause JSON.
        truncate3 (str): qualification-section PDF path; may be empty.

    Returns:
        dict: {'不得存在的其他情形': merged clause list}.
    """
    # NOTE: merging scoring-table PDFs via merge_pdfs was retired here;
    # that content is extracted elsewhere.
    if truncate3:
        # Ask the LLM to list the prohibited-bidder situations in the PDF.
        file_id=upload_file(truncate3)
        user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
        qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
    else:
        qianwen_forbidden_str="[]"
    llm_items = process_string_list(qianwen_forbidden_str)  # -> ["xxx", "xx"]
    includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
    excludes = ["招标", "评标", "定标"]
    raw_clauses = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
    local_items = extract_unique_items_from_texts(raw_clauses)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    merged_forbidden_list = list(dict.fromkeys(llm_items + local_items))
    return {'不得存在的其他情形': merged_forbidden_list}
#TODO:不得存在的情况文中有很多内容
2024-08-29 16:37:09 +08:00
if __name__ == '__main__':
    # Ad-hoc manual test: the paths point at a locally downloaded tender
    # bundle on the developer's Windows machine.
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
    truncate3 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf"
    # NOTE(review): output_dir and doc_path are unused below — presumably
    # leftovers from a larger pipeline; confirm before removing.
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx'
    res = find_forbidden(truncate_json_path, clause_path, truncate3)
    # Pretty-print the result, keeping Chinese characters readable.
    print(json.dumps(res, ensure_ascii=False, indent=4))