2024-08-29 16:37:09 +08:00
|
|
|
|
import ast
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
from PyPDF2 import PdfWriter, PdfReader
|
|
|
|
|
|
2024-08-29 17:30:49 +08:00
|
|
|
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
def extract_and_format_from_paths(json_paths, includes):
    """Read several JSON files and collect the values that mention a keyword.

    Walks each file's top-level mapping. A top-level string value is kept
    as-is when it contains any keyword; for a top-level dict, each *string*
    sub-value containing a keyword is kept formatted as ``"sub_key: sub_value"``.
    Missing or malformed files are reported on stdout and skipped.

    Args:
        json_paths (list): paths of the JSON files to scan.
        includes (list): keywords; a value is kept if it contains any of them.

    Returns:
        list: formatted matching strings from all files, in file order.
    """
    all_formatted_results = []

    for path in json_paths:
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)

            formatted_results = []
            for key, value in json_data.items():
                if isinstance(value, dict):
                    # Nested mapping: test each string sub-value individually.
                    for sub_key, sub_value in value.items():
                        # isinstance guard: `in` would raise TypeError on
                        # non-string sub-values (numbers, nested dicts, ...).
                        if isinstance(sub_value, str) and any(include in sub_value for include in includes):
                            formatted_results.append(f"{sub_key}: {sub_value}")
                elif isinstance(value, str):
                    # Plain string value: keep it verbatim when it matches.
                    if any(include in value for include in includes):
                        formatted_results.append(value)

            all_formatted_results.extend(formatted_results)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")

    return all_formatted_results
def extract_unique_items_from_texts(texts):
    """Split enumerated clauses out of each text and return them deduplicated.

    Each text first has any leading "intro:" prefix (everything up to the
    last colon, fullwidth or ASCII) removed, is then split on enumeration
    markers such as "1." or "(2)", and every fragment is cleaned of residual
    numbering and trailing punctuation. First-seen order is preserved.
    """
    marker_re = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\))\s*')
    lead_in_re = re.compile(r'^.*[::]')
    trailing_punct_re = re.compile(r'[;。,、..,:;!?!?]+$')

    unique_items = []
    seen_items = set()

    for raw_text in texts:
        # Drop the introductory sentence up to (and including) the colon.
        body = lead_in_re.sub('', raw_text)
        for fragment in marker_re.split(body):
            candidate = fragment.strip()
            if not candidate:
                continue
            # Strip any leftover numbering, then trailing punctuation.
            candidate = marker_re.sub('', candidate)
            candidate = trailing_punct_re.sub('', candidate).strip()
            if candidate and candidate not in seen_items:
                seen_items.add(candidate)
                unique_items.append(candidate)

    return unique_items
def merge_pdfs(paths, output_filename):
    """Concatenate the given PDF files into a single document.

    The merged file is written into the directory of the first input file,
    under ``output_filename``. With an empty ``paths`` nothing is written,
    a notice is printed, and ``None`` is returned.

    Args:
        paths (list): PDF file paths to merge, in order.
        output_filename (str): file name for the merged result.

    Returns:
        str | None: full path of the merged PDF, or None when no inputs.
    """
    writer = PdfWriter()
    merged_path = None

    for source_path in paths:
        reader = PdfReader(source_path)
        # Copy every page of the current document into the writer.
        for page in reader.pages:
            writer.add_page(page)
        # The output lives next to the first input file.
        if merged_path is None:
            merged_path = os.path.join(os.path.dirname(source_path), output_filename)

    if merged_path:
        with open(merged_path, 'wb') as out:
            writer.write(out)
        print(f"Merged PDF saved to {merged_path}")
    else:
        print("No files to merge.")

    return merged_path
def process_string_list(string_list):
    """Extract the first ``[...]`` group from a string and parse it to a list.

    The input is typically an LLM answer such as ``"结果:[xx, yy]"``. The
    bracketed content is re-quoted item by item (kept bare when every
    comma-separated part is purely digits) and parsed with
    :func:`ast.literal_eval`.

    Args:
        string_list (str): text that may contain one bracketed list.

    Returns:
        list: the parsed items (ints or strs), or ``[]`` when there are no
        brackets, the brackets are empty, or parsing fails.
    """
    # Non-greedy: grab the first bracketed section only.
    match = re.search(r'\[(.*?)\]', string_list)
    if not match:
        # 如果没有匹配到内容,返回空列表
        return []

    content_inside_brackets = match.group(1)
    if not content_inside_brackets:  # "[]" -> empty list
        return []

    parts = content_inside_brackets.split(',')
    # Note: the all() runs over *raw* parts (including empty ones from a
    # trailing comma), matching the original behavior exactly.
    if all(part.strip().isdigit() for part in parts):
        # All-numeric content: keep items unquoted so they parse as ints.
        formatted_list = '[' + ', '.join(part.strip() for part in parts if part.strip()) + ']'
    else:
        # Otherwise quote each non-empty item so they parse as strings.
        formatted_list = '[' + ', '.join(f"'{part.strip()}'" for part in parts if part.strip()) + ']'

    try:
        return ast.literal_eval(formatted_list)
    except (SyntaxError, ValueError) as e:
        # literal_eval raises ValueError for malformed nodes / null bytes;
        # previously only SyntaxError was caught and ValueError crashed.
        print(f"Error parsing list: {e}")
        return []
def find_forbidden(truncate_json_path, clause_path, truncate3):
    """Collect the bidder "forbidden circumstances" clauses of a tender file.

    Asks the qianwen-long model about the uploaded document (``truncate3``,
    e.g. the qualification-review PDF), then supplements the answer with
    keyword matches extracted from the two local JSON files, merging both
    lists while removing duplicates and keeping first-seen order.

    Args:
        truncate_json_path: path of the truncated-output JSON extract.
        clause_path: path of the clause JSON extract.
        truncate3: document file to upload for the LLM query.

    Returns:
        dict: {'不得存在的其他情形': [merged unique clause strings]}
    """
    # (An earlier revision merged several PDFs here first; that step was
    # dropped because the scoring-table content is captured elsewhere.)
    file_id = upload_file(truncate3)
    user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
    qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
    llm_items = process_string_list(qianwen_forbidden_str)  # -> ["xxx", "xx", ...]

    # Local fallback: scan the JSON extracts for the same kind of clauses.
    keyword_list = ["不得存在", "禁止投标"]
    raw_matches = extract_and_format_from_paths([truncate_json_path, clause_path], keyword_list)
    local_items = extract_unique_items_from_texts(raw_matches)

    # dict.fromkeys keeps first-seen order while dropping duplicates.
    merged_forbidden_list = list(dict.fromkeys(llm_items + local_items))
    return {'不得存在的其他情形': merged_forbidden_list}
if __name__ == '__main__':
    # Ad-hoc manual test run against a local download directory.
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
    # Qualification-review PDF; passed below as the `truncate3` argument.
    truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"
    # NOTE(review): output_dir and doc_path are unused in this script —
    # presumably leftovers from other entry points; confirm before removing.
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'
    # Result dict is discarded here; find_forbidden prints nothing on success.
    find_forbidden(truncate_json_path,clause_path,truncate4)