11.27 清理file_id
This commit is contained in:
parent
d48d7ec8b2
commit
8078651aad
@ -12,7 +12,7 @@ from flask_app.货物标.基础信息解析main import aggregate_basic_info_good
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
|
||||
from flask_app.main.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
|
||||
from flask_app.general.post_processing import inner_post_processing
|
||||
from flask_app.old_version.基础信息整合 import aggregate_basic_info_engineering
|
||||
from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
|
@ -8,6 +8,12 @@ client = OpenAI(
|
||||
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
)
|
||||
|
||||
|
||||
def delete_all_files():
    """Delete every file stored under the DashScope account.

    Lists all files via the module-level ``client`` and deletes them one by
    one.  Errors are reported to stdout rather than raised, so a failure
    does not crash the caller.
    """
    try:
        # Fetch the full file listing and pull out the ids.
        file_stk = client.files.list()
        file_data = json.loads(file_stk.model_dump_json())
        file_ids = [file["id"] for file in file_data["data"]]

        # Delete each file; log the server response once per file
        # (the original printed the same payload twice).
        for file_id in file_ids:
            file_object = client.files.delete(file_id)
            print(f"Deleted file with id: {file_id} - {file_object.model_dump_json()}")

    except Exception as e:
        # Best-effort cleanup: report and continue rather than propagate.
        print(f"An error occurred while deleting files: {e}")
|
||||
|
||||
|
||||
def read_file_ids(output_folder):
    """Read uploaded-file ids from ``file_ids.txt`` inside *output_folder*.

    :param output_folder: directory that may contain a ``file_ids.txt``
        (one id per line, as written by ``upload_file``).
    :return: list of non-empty id strings; ``[]`` when the file is missing
        or unreadable (the error is printed, not raised).
    """
    file_ids = []
    file_ids_path = os.path.join(output_folder, 'file_ids.txt')
    if os.path.exists(file_ids_path):
        try:
            with open(file_ids_path, 'r', encoding='utf-8') as file:
                # Skip blank lines (e.g. a trailing newline) so callers
                # never receive an empty string as a file id.
                file_ids = [line.strip() for line in file if line.strip()]
                print(f"读取到的文件 ID 列表:{file_ids}")
        except Exception as e:
            print(f"读取 file_ids.txt 文件时发生错误: {e}")
    else:
        print(f"文件 {file_ids_path} 不存在。")

    return file_ids
|
||||
|
||||
|
||||
def delete_file_by_ids(file_ids):
    """Delete the files identified by *file_ids* from the DashScope account.

    :param file_ids: list of file-id strings.  Invalid input is reported to
        stdout and the function returns without deleting anything.
    :return: None.  Per-file failures are printed and the loop continues.
    """
    # Validate the input shape up front so a bad argument cannot trigger
    # a confusing mid-loop failure.
    if not isinstance(file_ids, list):
        print("Error: file_ids should be a list.")
        return
    if not all(isinstance(file_id, str) for file_id in file_ids):
        print("Error: Each file_id should be a string.")
        return

    # The former outer try/except was dead code: after validation the only
    # thing that can raise is the delete call, already handled per file.
    for file_id in file_ids:
        try:
            client.files.delete(file_id)
        except Exception as e:
            # One failed deletion must not abort the remaining ones.
            print(f"Failed to delete file with id {file_id}: {e}")
|
||||
|
||||
if __name__ == '__main__':
|
||||
delete_all_files()
|
@ -1,8 +1,10 @@
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import threading
|
||||
from functools import wraps
|
||||
|
||||
from flask import g
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
import random
|
||||
import time
|
||||
@ -10,16 +12,35 @@ from pathlib import Path
|
||||
from openai import OpenAI
|
||||
import os
|
||||
|
||||
# Serializes appends to file_ids.txt across worker threads.
file_write_lock = threading.Lock()


def upload_file(file_path, output_folder=""):
    """Upload *file_path* to DashScope and return the new file id.

    The id is also appended to ``file_ids.txt`` inside *output_folder*
    (defaulting to the file's own directory) so it can be cleaned up later.

    :param file_path: path of the document to upload.
    :param output_folder: directory for the id log; created if missing.
    :return: the DashScope file id string.
    """
    if not output_folder:
        output_folder = os.path.dirname(file_path)

    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

    # Upload the document and keep its server-side id.
    file = client.files.create(file=Path(file_path), purpose="file-extract")
    file_id = file.id

    # exist_ok avoids a race when several threads create the folder at once.
    os.makedirs(output_folder, exist_ok=True)

    # Appends from concurrent uploads must not interleave.
    with file_write_lock:
        file_ids_path = os.path.join(output_folder, 'file_ids.txt')
        with open(file_ids_path, 'a', encoding='utf-8') as f:
            f.write(f'{file_id}\n')

    return file_id
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(calls=4, period=1) # 每秒最多调用4次
|
||||
|
@ -1,172 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from PyPDF2 import PdfWriter, PdfReader
|
||||
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.通用功能函数 import process_string_list
|
||||
|
||||
|
||||
def extract_and_format_from_paths(json_paths, includes, excludes):
    """Collect keyword-matching snippets from several JSON files.

    For dict values, a nested value is kept when its key contains any of
    *includes*.  For string values, the string is kept when it contains an
    include keyword and the text before that keyword contains none of
    *excludes* (the first line is dropped for multi-line strings).

    :param json_paths: list of JSON file paths to scan.
    :param includes: keywords that select content.
    :param excludes: keywords that veto a string match.
    :return: flat list of formatted strings gathered from all files.
    """
    all_formatted_results = []

    for path in json_paths:
        # Invalid or missing paths are skipped with a diagnostic.
        if not path or not os.path.isfile(path):
            print(f"禁止投标情形: Error: The file path '{path}' is invalid or does not exist.")
            continue
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except FileNotFoundError:
            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
            continue

        for key, value in json_data.items():
            if isinstance(value, dict):
                # Nested dict: match on the inner keys.
                all_formatted_results.extend(
                    f"{sub_value}"
                    for sub_key, sub_value in value.items()
                    if any(include in sub_key for include in includes)
                )
            elif isinstance(value, str):  # clause
                for include in includes:
                    if include not in value:
                        continue
                    # Veto the match when an exclude word precedes it.
                    prefix = value.split(include)[0]
                    if any(exclude in prefix for exclude in excludes):
                        continue
                    # Drop the heading line of multi-line clauses.
                    if '\n' in value:
                        value = value.split('\n', 1)[-1]
                    all_formatted_results.append(value)
                    break  # one hit per value is enough

    return all_formatted_results
|
||||
|
||||
def extract_unique_items_from_texts(texts):
    """Split free-form clause texts into de-duplicated list items.

    Each text is flattened, its leading "intro" (everything up to the first
    colon) is removed, URLs are shielded from splitting, and the remainder
    is split on numbering markers (1., (1), ①, ...).  Cleaned items are
    returned in first-seen order.

    :param texts: iterable of clause strings.
    :return: list of unique cleaned items with substantial content.
    """
    # Numbering markers used to split items, including CJK circled digits.
    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
    # Leading text up to (and including) the first full- or half-width colon.
    intro_pattern = re.compile(r'^.*?[::]')
    # Trailing punctuation to strip from each item.
    punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # Requires at least 2 consecutive letters, digits or CJK characters.
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')

    all_results = []
    seen = set()

    for text in texts:
        # Flatten the text: tabs and newlines would break the item split.
        text = text.replace('\t', '').replace('\n', '')

        # Drop the introductory phrase (up to the colon, colon included).
        text = intro_pattern.sub('', text)

        # Replace URLs with placeholders so the numbering split cannot cut
        # through them; they are restored per item below.
        urls = []
        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"
        text = url_pattern.sub(url_replacer, text)

        # Split on the numbering markers.
        items = pattern.split(text)

        for item in items:
            cleaned_item = item.strip()
            if cleaned_item:
                # Remove residual numbering and trailing punctuation.
                cleaned_item = pattern.sub('', cleaned_item)
                cleaned_item = punctuation_pattern.sub('', cleaned_item)
                cleaned_item = cleaned_item.strip()

                # Restore the URLs captured above.
                for i, url in enumerate(urls, 1):
                    cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

                # Keep only unseen items with enough substance: longer than
                # 3 chars and containing real text (not just punctuation).
                if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
                    seen.add(cleaned_item)
                    all_results.append(cleaned_item)

    return all_results
|
||||
|
||||
def merge_pdfs(paths, output_filename):
    """Concatenate the PDFs in *paths* into one file.

    The merged file is written next to the first input (same directory)
    under *output_filename*.

    :param paths: list of input PDF paths, merged in order.
    :param output_filename: bare file name for the merged output.
    :return: path of the merged PDF, or None when *paths* is empty.
    """
    pdf_writer = PdfWriter()
    output_path = None

    for path in paths:
        pdf_reader = PdfReader(path)
        for page in range(len(pdf_reader.pages)):
            # Append every page of the current input.
            pdf_writer.add_page(pdf_reader.pages[page])

        if output_path is None:
            # The output lives in the directory of the first input file.
            output_path = os.path.join(os.path.dirname(path), output_filename)

    # Write the merged document (skipped when there was no input).
    if output_path:
        with open(output_path, 'wb') as out:
            pdf_writer.write(out)
        print(f"禁止投标情形: Merged PDF saved to {output_path}")
    else:
        print("禁止投标情形: No files to merge.")
    return output_path
|
||||
|
||||
|
||||
def find_forbidden(qualification=""):
    """Ask the LLM which bidder situations the tender document forbids.

    :param qualification: path of the qualification PDF; when empty, the
        model is not queried and an empty list is assumed.
    :return: dict with key '不得存在的其他情形' mapping to the de-duplicated
        list of forbidden situations.  NOTE(review): on failure the value
        is "" (a string, not a list) — callers must tolerate both types.
    """
    try:
        if qualification:
            file_id = upload_file(qualification)
            user_query_forbidden = (
                "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],"
                "请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
            )
            qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
        else:
            # No qualification document: nothing to extract.
            qianwen_forbidden_str = "[]"

        actual_list = process_string_list(qianwen_forbidden_str)  # parse "[xx, ...]" into a Python list
        includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
        excludes = ["招标", "评标", "定标"]
        # An earlier version also merged results scanned from local JSON files:
        # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
        # processed_results = extract_unique_items_from_texts(forbidden_results)
        # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
        merged_forbidden_list = list(dict.fromkeys(actual_list))  # de-duplicate, keep order
        forbidden_dict = {'不得存在的其他情形': merged_forbidden_list}
        return forbidden_dict
    except Exception as e:
        print(f"find_forbidden 在处理时发生异常: {e}")
        return {'不得存在的其他情形': ""}
|
||||
|
||||
#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。
|
||||
if __name__ == '__main__':
|
||||
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json"
|
||||
# clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
|
||||
qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
|
||||
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f"
|
||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx'
|
||||
res = find_forbidden(qualification)
|
||||
print(json.dumps(res, ensure_ascii=False, indent=4))
|
@ -1,216 +0,0 @@
|
||||
#基于多线程提问,现已废弃
|
||||
# assistant_id
|
||||
import queue
|
||||
import concurrent.futures
|
||||
from dashscope import Assistants, Messages, Runs, Threads
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
prompt = """
|
||||
# 角色
|
||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||
|
||||
## 技能
|
||||
### 技能 1:文档解析与摘要
|
||||
- 深入理解并分析${document1}的内容,提取关键信息。
|
||||
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。
|
||||
|
||||
### 技能 2:信息检索与关联
|
||||
- 在${document1}中高效检索特定信息或关键词。
|
||||
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。
|
||||
|
||||
## 限制
|
||||
- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。
|
||||
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
|
||||
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。
|
||||
|
||||
请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。
|
||||
"""
|
||||
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}'
|
||||
|
||||
|
||||
|
||||
|
||||
#正文和文档名之间的内容
|
||||
def extract_content_between_tags(text):
    """Extract the body text between each 【正文】 marker and the next
    【文档名】 / 'file_ids' marker (or the end of the segment).

    :param text: raw assistant run-status text containing 【正文】 sections.
    :return: list of stripped body strings, one per 【正文】 marker;
        ``[]`` when the marker never occurs.
    """
    results = []
    # Everything before the first 【正文】 is a preamble, not content.
    parts = text.split('【正文】')[1:]

    for part in parts:
        doc_name_index = part.find('【文档名】')
        file_ids_index = part.find("'file_ids'")

        # Content runs up to whichever terminator appears, else to the end.
        if doc_name_index != -1:
            end_index = doc_name_index
        elif file_ids_index != -1:
            end_index = file_ids_index
        else:
            end_index = len(part)

        results.append(part[:end_index].strip())

    # The final segment may carry a trailing 'file_ids' payload; trim it.
    # Guarding on `parts` fixes an IndexError when no marker was found.
    if parts and "'file_ids'" in parts[-1]:
        file_ids_index = parts[-1].find("'file_ids'")
        results[-1] = parts[-1][:file_ids_index].strip()

    return results
|
||||
|
||||
def find_references_in_extracted(formatted_ans, extracted_references):
    """Map each leaf of the nested *formatted_ans* dict to the index of the
    first reference string containing its value.

    :param formatted_ans: arbitrarily nested dict of answer values.
    :param extracted_references: list of reference texts to search.
    :return: dict keyed by the dotted path of each leaf; the value is the
        matching reference index, ``-1`` for the literal '未知', or ``None``
        when no reference contains the value.
    """
    results = {}

    def recurse_through_dict(current_dict, path=()):
        # `path` is a tuple: immutable, so the mutable-default-argument
        # pitfall of the previous `path=[]` cannot occur.
        for key, value in current_dict.items():
            leaf_path = path + (key,)
            if isinstance(value, dict):
                recurse_through_dict(value, leaf_path)
            elif value == '未知':
                # Sentinel: unknown answers are flagged with -1.
                results['.'.join(leaf_path)] = -1
            else:
                # Index of the first reference containing the value, else None.
                results['.'.join(leaf_path)] = next(
                    (index for index, reference in enumerate(extracted_references)
                     if str(value) in reference),
                    None,
                )

    recurse_through_dict(formatted_ans)
    return results
|
||||
|
||||
def send_message(assistant, message='百炼是什么?'):
    """Run one query against *assistant* and collect its answers.

    :param assistant: a DashScope Assistant instance.
    :param message: the user query text.
    :return: tuple (ans, extracted_references); `ans` holds the thread's
        message texts in reversed API listing order — presumably
        oldest-first after reversal; confirm against the Messages API.
    """
    ans = []
    print(f"Query: {message}")

    # Create a fresh conversation thread for this query.
    thread = Threads.create()
    print(thread)

    # Post the user message into the thread.
    message = Messages.create(thread.id, content=message)
    # Start a run of the assistant over the thread.

    run = Runs.create(thread.id, assistant_id=assistant.id)
    print(run)

    # Block until the run completes (or requires action).
    run_status = Runs.wait(run.id, thread_id=thread.id)
    print(run_status)
    reference_txt = str(run_status)
    # Parse the cited source passages out of the raw run-status text.
    extracted_references = extract_content_between_tags(reference_txt)
    # Collect all message texts from the thread, reversed.
    msgs = Messages.list(thread.id)
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
    return ans,extracted_references
|
||||
|
||||
def rag_assistant(knowledge_name):
    """Create a qwen-max assistant wired to a DashScope knowledge base.

    :param knowledge_name: name of the DashScope cloud knowledge base; its
        retriever pipeline id is injected into the assistant's RAG tool.
    :return: the created Assistant instance.
    """
    retriever = DashScopeCloudRetriever(knowledge_name)
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        temperature='0.3',
        # `prom` is the module-level instruction prompt with the
        # ${document1} placeholder filled in by the RAG tool below.
        instructions=prom,
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    return assistant
|
||||
|
||||
|
||||
def pure_assistant():
    """Create a plain qwen-max assistant with no knowledge-base retrieval."""
    # Description and instructions intentionally share the same wording.
    helper_text = '智能助手,能基于用户的要求精准简洁地回答用户的提问'
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description=helper_text,
        instructions=helper_text,
        tools=[{"type": "code_interpreter"}],
    )
    return assistant
|
||||
|
||||
def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True):
    """Worker: ask one question and push the answer into *result_queue*.

    :param question: query text for the assistant.
    :param knowledge_name: knowledge base to attach when *use_rag* is True.
    :param result_queue: queue.Queue receiving (ans_index, (question, ans)).
    :param ans_index: position of this question in the original list.
    :param use_rag: choose a RAG-backed assistant vs. a plain one.
    """
    if use_rag:
        assistant = rag_assistant(knowledge_name)
    else:
        assistant = pure_assistant()
    ans,extracted_references = send_message(assistant, message=question)
    for index, reference in enumerate(extracted_references, start=0):
        print(f"{index}. {reference}")
    # NOTE(review): assumes ans[1] holds the JSON answer text — confirm
    # the message ordering returned by send_message.
    formatted_ans=extract_content_from_json(ans[1])
    print(formatted_ans)

    # Locate each answer value inside the extracted reference snippets.
    results = find_references_in_extracted(formatted_ans, extracted_references)
    for key, index in results.items():
        print(f"{key}: Found at index {index}")
    result_queue.put((ans_index, (question, ans)))  # index preserves the original question order
|
||||
|
||||
def multi_threading(queries, knowledge_name, use_rag=True):
    """Answer *queries* concurrently and return the results in input order.

    :param queries: list of question strings.
    :param knowledge_name: knowledge base passed to each llm_call worker.
    :param use_rag: forwarded to llm_call (RAG assistant vs. plain).
    :return: list aligned with *queries*; each entry is (question, ans) or
        None when that worker produced no result.
    """
    result_queue = queue.Queue()

    # Fan the queries out over a small thread pool.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Map each future back to its query index so failures can be reported.
        future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for
                           index, query in enumerate(queries)}

        # llm_call enqueues its own results; here we only surface exceptions.
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            try:
                future.result()  # raises if the worker raised
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")

    # Drain the queue and re-order the results by their original index.
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    return results
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 读取问题列表
|
||||
questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
|
||||
knowledge_name = "招标解析5word"
|
||||
results = multi_threading(questions, knowledge_name, use_rag=True)
|
||||
# 打印结果
|
||||
for question, response in results:
|
||||
print(f"Question: {question}")
|
||||
print(f"Response: {response}")
|
@ -1,166 +0,0 @@
|
||||
import json
|
||||
|
||||
from flask_app.general.json_utils import clean_json_string, rename_outer_key
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.通用功能函数 import judge_consortium_bidding
|
||||
|
||||
def aggregate_basic_info_engineering(baseinfo_list):
    """Merge the extracted basic-info dicts and group them by topic.

    :param baseinfo_list: list of flat dicts produced by the various
        extraction steps.
    :return: dict keyed by the fixed group names; each group maps its known
        sub-keys to the extracted value or "未提供" when absent.  Keys not
        claimed by any group end up inside the "其他信息" group.
    """
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
        "项目信息": ["项目名称", "招标编号", "项目概况", "招标范围", "招标控制价", "投标竞争下浮率"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "投标文件递交方式",
            "开标时间",
            "开标地点",
            "投标人要求澄清招标文件的截止时间",
            "投标有效期",
            "评标结果公示媒介"
        ],
        "保证金相关": ["质量保证金", "退还投标保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "投标费用承担",
            "招标代理服务费",
            "是否退还投标文件",
        ]
    }

    combined_data = {}
    relevant_keys_detected = set()

    # Merge all extracted dicts; later entries win on duplicate keys.
    for baseinfo in baseinfo_list:
        combined_data.update(baseinfo)
        relevant_keys_detected.update(baseinfo.keys())

    # Route dynamically detected keys into the right groups.
    dynamic_key_handling(key_groups, relevant_keys_detected)

    # Collect the keys not claimed by any group.
    unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]}

    # Build one nested dict per group; missing keys become "未提供".
    for group_name, keys in key_groups.items():
        group_data = {key: combined_data.get(key, "未提供") for key in keys}
        combined_data[group_name] = group_data
        # Anything just classified is no longer "unclassified".
        for key in keys:
            unclassified_items.pop(key, None)

    # Whatever remains goes into the catch-all group.
    combined_data["其他信息"].update(unclassified_items)

    # Drop the original flat keys, keeping only the group entries.
    for key in list(combined_data.keys()):
        if key not in key_groups:
            del combined_data[key]

    return combined_data
|
||||
|
||||
def dynamic_key_handling(key_groups, detected_keys):
    """Route dynamically detected keys into the appropriate key group.

    Mutates *key_groups* in place: deposit-related keys ("保证金") are
    inserted just before '质量保证金' in the 保证金相关 group; other
    recognised keywords are appended to 项目信息 or 其他信息.
    """
    # keyword -> destination group for the simple append cases
    # (checked in order; the first hit wins).
    routing = (
        ("联合体", "项目信息"),
        ("分包", "项目信息"),
        ("踏勘现场", "其他信息"),
        ("投标预备会", "其他信息"),
        ("偏离", "其他信息"),
    )

    for key in detected_keys:
        if "保证金" in key:
            group = key_groups["保证金相关"]
            anchor = "质量保证金"
            if anchor in group:
                # Keep deposit keys ahead of the quality-deposit anchor,
                # skipping duplicates.
                if key not in group:
                    group.insert(group.index(anchor), key)
            else:
                # No anchor present: just append at the end.
                group.append(key)
            continue
        for keyword, group_name in routing:
            if keyword in key:
                key_groups[group_name].append(key)
                break
|
||||
|
||||
def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path):
    """Assemble the final "基础信息" dict for an engineering tender.

    :param knowledge_name: knowledge-base name used for the RAG queries.
    :param truncate0: path of the truncated bidder-notice PDF.
    :param output_folder: working directory for intermediate outputs.
    :param clause_path: path of the parsed clause JSON.
    :return: dict {"基础信息": aggregated info grouped by topic}.
    """
    baseinfo_list = []
    baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt'
    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    res1 = multi_threading(questions, knowledge_name)

    # Each worker returns (index, response); response[1] holds the JSON text.
    for index, response in res1:
        try:
            if response and len(response) > 1:
                baseinfo_list.append(clean_json_string(response[1]))
            else:
                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.")
        except Exception as e:
            print(f"基础信息整合: Error processing response for query index {index}: {e}")

    # Decide which follow-up questions apply (subcontracting, bid bond, ...).
    chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
    baseinfo_list.append(merged)

    judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
    # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt'
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
    # Whether the notice accepts consortium bidding decides an extra query.
    judge_consortium = judge_consortium_bidding(baseinfo_list)

    if judge_consortium:
        judge_consortium_question = (
            "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,"
            "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
        )
        judge_questions.append(judge_consortium_question)

    # Ask the follow-up questions against the uploaded document (qwen-long).
    file_id = upload_file(truncate0)
    res2 = multi_threading(judge_questions, "", file_id, 2)

    if not res2:
        print("基础信息整合: multi_threading error!")
    else:
        for question, response in res2:
            baseinfo_list.append(clean_json_string(response))

    # "重新招标…" must be extracted from the bidder-notice body text.
    rebidding_situation = extract_from_notice(clause_path, 3)
    update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)
    aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list)  # a single grouped dict
    return {"基础信息": aggregated_baseinfo}
|
||||
|
||||
if __name__ == "__main__":
|
||||
knowledge_name = "ztb"
|
||||
output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405"
|
||||
truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf"
|
||||
clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json"
|
||||
res=combine_basic_info(knowledge_name,truncate0,output_folder,clause_path)
|
||||
print(json.dumps(res,ensure_ascii=False,indent=4))
|
||||
|
||||
|
||||
|
||||
|
@ -1,373 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import shutil
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
|
||||
|
||||
def validate_pdf(file_path):
    """Return True when *file_path* parses as a PDF with at least one page."""
    try:
        with open(file_path, "rb") as handle:
            page_count = len(PdfReader(handle).pages)
    except Exception as e:
        # Unreadable or corrupt files are reported and treated as invalid.
        print(f"Error reading PDF {file_path}: {str(e)}")
        return False
    return page_count > 0
|
||||
|
||||
|
||||
def truncate_pdf(source_path, target_path, max_pages=15):
    """Copy at most the first *max_pages* pages of a PDF to *target_path*.

    Errors are printed rather than raised, leaving the target untouched.
    """
    try:
        with open(source_path, "rb") as infile:
            reader = PdfReader(infile)
            writer = PdfWriter()
            # Never request more pages than the document actually has.
            page_limit = min(max_pages, len(reader.pages))
            for index in range(page_limit):
                writer.add_page(reader.pages[index])
            with open(target_path, "wb") as outfile:
                writer.write(outfile)
    except Exception as e:
        print(f"Error processing PDF {source_path}: {str(e)}")
|
||||
|
||||
def copy_file(src, dest):
    """Copy one file to *dest*, creating the destination directory first."""
    dest_dir = os.path.dirname(dest)
    os.makedirs(dest_dir, exist_ok=True)
    # copy2 preserves metadata (timestamps) along with the contents.
    shutil.copy2(src, dest)
|
||||
|
||||
def copy_directory(source, destination):
    """Recursively copy *source* into *destination* (created if missing)."""
    os.makedirs(destination, exist_ok=True)
    for entry in os.listdir(source):
        src_path = os.path.join(source, entry)
        dst_path = os.path.join(destination, entry)
        # Files are copied directly; subdirectories recurse.
        if os.path.isfile(src_path):
            copy_file(src_path, dst_path)
        else:
            copy_directory(src_path, dst_path)
|
||||
|
||||
def unique_file_name(base_path, file_name):
    """Return *file_name*, or a 'name_N.ext' variant that does not yet
    exist inside *base_path*."""
    stem, extension = os.path.splitext(file_name)
    candidate = file_name
    suffix = 1
    # Bump the numeric suffix until the candidate name is free.
    while os.path.exists(os.path.join(base_path, candidate)):
        candidate = f"{stem}_{suffix}{extension}"
        suffix += 1
    return candidate
|
||||
|
||||
def process_pdf_folders(source_dir, target_dir):
    """Mirror valid PDFs from *source_dir* into *target_dir*.

    Walks the tree bottom-up, copies every PDF that parses correctly to the
    same relative path under *target_dir*, reports corrupt ones, and prunes
    source directories that end up empty.
    """
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if file.lower().endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                if validate_pdf(source_file_path):
                    # Keep the same relative layout below target_dir.
                    relative_path = os.path.relpath(root, source_dir)
                    target_file_dir = os.path.join(target_dir, relative_path)
                    target_file_path = os.path.join(target_file_dir, file)
                    copy_file(source_file_path, target_file_path)
                else:
                    # NOTE(review): the corrupt file is only reported, not
                    # actually deleted, despite the message text.
                    print(f"Deleted corrupt file: {source_file_path}")
        # Remove directories left empty (bottom-up walk: children first).
        if not os.listdir(root):
            os.rmdir(root)
|
||||
|
||||
|
||||
def classify_folders(source_dir, target_dir, project_index):
    """Classify the sub-folders of *source_dir* into the target project tree.

    A sub-folder containing ``zb.pdf`` is the tender folder and establishes
    the target project directory; a sub-folder named 输出文件 holds
    evaluation files; everything else is treated as bid documents.
    *source_dir* is removed once processing finishes.
    """
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)

    target_project_dir = None
    # Initialized up front: without a 'zb.pdf' folder the later call to
    # process_remaining_folders would otherwise raise UnboundLocalError.
    target_zb_name = None
    processed_dirs = set()  # directories already handled below

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            # Tender folder: derives the project directory and tender name.
            target_project_dir, target_zb_name = process_tender_files(
                subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)
        elif subdir.lower() == "输出文件":
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)

    # Remaining folders hold bid files; skip the ones handled above.
    process_remaining_folders(source_dir, target_project_dir, project_index,
                              temp_dir, processed_dirs, target_zb_name)

    # Everything has been copied out; drop the source tree (temp included).
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)
|
||||
|
||||
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Classify a folder containing ``zb.pdf`` (the tender document).

    The tender PDF is truncated, sent to the model for classification,
    renamed after the parsed project, and the whole folder is copied into
    the target project tree.

    :return: tuple (target_project_dir, tender name from the answer).
    """
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # classify from the first 30 pages only
    bot_response = file_parse(truncated_zb_path, project_index, 1)
    zb_response = extract_answer(bot_response[1], 1)
    # Directory names may not contain slashes; replace them with '-'.
    zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-')
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]

    # Rename the tender PDF after the parsed tenderer name.
    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)

    # Layout: <target>/<category>/<project-name>_<project-id>/招标文件夹
    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))

    # Clean up: the temp truncation and the now-copied source folder.
    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, zb_response[2]
|
||||
|
||||
|
||||
def process_evaluation_files(subdir_path, target_project_dir):
    """Move the evaluation folder into the project's 评标文件夹 directory."""
    destination = os.path.join(target_project_dir, "评标文件夹")
    copy_directory(subdir_path, destination)
    # The originals are no longer needed once copied.
    shutil.rmtree(subdir_path)
|
||||
|
||||
|
||||
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Classify the leftover sub-folders, assumed to hold bid documents.

    Each PDF is truncated, classified by the model, renamed after its
    section, and the folder is copied into 投标文件夹/<bidder name>.

    :param processed_dirs: directories already handled (skipped here).
    :param target_zb_name: tender name; a bidder matching it (or '未知')
        does not get its own directory.
    """
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # skip plain files and already-processed folders

        target_tbcom_dir = None  # bidder dir, fixed by the first classified file

        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            new_name = "truncate_" + item
            truncated_tb_path = os.path.join(temp_dir, new_name)
            # Classify from a truncated copy to keep the upload small.
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, project_index, 2)
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                continue  # unclassifiable file: leave it as-is

            # The first classified file decides the bidder directory.
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if(target_tb_name != target_zb_name and target_tb_name != "未知"):
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)

            # Rename the file after its classified section.
            tb_section = tb_response[1] + ".pdf"
            new_tb_path = os.path.join(subdir_path, tb_section)
            # (the assignment above is immediately overwritten by the
            # collision-free name below)
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)

            # Drop the temporary truncated copy.
            os.remove(truncated_tb_path)

        # Copy the whole folder once, after every file has been renamed.
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)  # remove the original after copying
|
||||
|
||||
import re
|
||||
|
||||
def extract_answer(input_string, type):
    """Extract structured fields from a model reply.

    type == 1: tender reply — returns [project name, project number, issuer, category].
    type == 2: bid reply — returns [bidder, category]; a category mentioning
    投标函 is normalized to 商务文件. Returns [] when the pattern does not match.
    """
    # Field values are matched lazily up to a (full- or half-width) semicolon;
    # \s* tolerates trailing whitespace after each label.
    tender_pattern = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"
    bidder_pattern = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"

    if type == 1:
        found = re.search(tender_pattern, input_string)
        if not found:
            print("No match found for type 1.")
            return []
        fields = [found.group(i).strip() for i in range(1, 5)]
        print(f"1: {fields[0]} 2: {fields[1]} 3: {fields[2]} 4: {fields[3]}")
        return fields
    elif type == 2:
        found = re.search(bidder_pattern, input_string)
        if not found:
            print("No match found for type 2.")
            return []
        # Replies containing 投标函 are treated as the commercial section.
        category = found.group(2).strip()
        if "投标函" in category:
            category = "商务文件"
        return [found.group(1).strip(), category]
|
||||
|
||||
|
||||
from llama_index.readers.dashscope.base import DashScopeParse
|
||||
from llama_index.readers.dashscope.utils import ResultType
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
||||
from dashscope import Assistants, Messages, Runs, Threads
|
||||
|
||||
|
||||
def send_message(assistant, index, documents, message='百炼是什么?'):
    """Run one query against a DashScope assistant and collect every reply.

    Creates a fresh thread, posts *message*, runs the assistant, waits for
    completion, then returns all message texts (oldest first). As a side effect
    the parsed *documents* are removed from the cloud *index* afterwards.
    """
    print(f"Query: {message}")

    # Create a fresh conversation thread for this single query.
    thread = Threads.create()
    print(thread)

    # Post the user message into the thread.
    message = Messages.create(thread.id, content=message)

    # Start the assistant run on that thread.
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))

    # Poll once for the run status.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)

    # Block until the run completes (or requires action).
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)

    # Re-fetch the final status; the value is not inspected further here.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)

    # Fetch the thread messages produced by the run.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))

    ans = []

    print("运行结果:")
    # Messages come newest-first; reverse so answers are chronological.
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
    print("\n")

    # Clean up: drop the uploaded documents from the knowledge index.
    deleteFileFromKnowledge(index, documents)
    return ans
|
||||
|
||||
|
||||
def file_parse(filepath, knowledge_index, type):
    """Parse *filepath* with DashScope DocMind, index it, and query an assistant.

    type == 1 asks the tender-file questions (name / number / issuer / category);
    type == 2 asks the bid-file questions (bidder / section category).
    Returns the list of answers produced by send_message.

    NOTE(review): if *type* is neither 1 nor 2, `questions` is never bound and the
    final call raises NameError — confirm callers only ever pass 1 or 2.
    """
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    # Map an index value to a cloud knowledge-base name.
    index_to_name = {
        0: "文件分类知识库0",
        1: "文件分类知识库1",
        2: "文件分类知识库2",
        3: "文件分类知识库3"
    }
    # Fall back to "0" when the index is not one of the known keys.
    # NOTE(review): "0" is not one of the names above — confirm a cloud index
    # literally named "0" exists, or whether the intended default differs.
    cloud_index_name = index_to_name.get(knowledge_index, "0")

    index = DashScopeCloudIndex(cloud_index_name)
    index._insert(documents)  # private API: inserts the parsed docs into the cloud index
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    # Build an assistant wired to this index through the RAG tool.
    assistant = Assistants.create(
        model='qwen-plus',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }

                        }
                    }
                }
            }]
    )
    # Prompt for tender files: asks name, number, issuer and category, with a
    # primer on the three tender categories, and a strict key:value reply format.
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\
请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘"
    # Prompt for bid files: asks bidder and section category (商务/技术/报价).
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’"
    if (type == 1):
        questions = questions1
    elif (type == 2):
        questions = questions2
    return send_message(assistant, index, documents, message=questions)
|
||||
|
||||
def deleteFileFromKnowledge(index, documents):
    """Remove previously inserted parsed documents from the cloud *index*."""
    # Collect the ids of all documents that were inserted.
    file_ids = []

    # Only extract ids when we actually received a non-empty list.
    if isinstance(documents, list) and documents:
        for document in documents:
            # getattr guards against documents lacking an `id_` attribute.
            file_id = getattr(document, 'id_', None)
            if file_id:
                file_ids.append(file_id)
    # NOTE(review): the original layout made the indent ambiguous; this places the
    # deletion at function level, so it also runs (with an empty id list) when no
    # documents were supplied — confirm against the original file.
    index.delete_ref_doc(file_ids)
|
||||
|
||||
import concurrent.futures
|
||||
import logging
|
||||
|
||||
# 配置日志,只输出到控制台
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index):
    """Validate and classify one source folder through a private scratch directory.

    Copies valid PDFs into a per-task intermediate directory, classifies them into
    *final_directory*, then removes the scratch directory.
    """
    logging.info(f"Starting to process directory: {subdir_path} with index {project_index}")

    # Give each worker its own scratch directory so parallel tasks don't collide.
    intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}")
    os.makedirs(intermediate_directory, exist_ok=True)
    logging.info(f"Created intermediate directory: {intermediate_directory}")

    process_pdf_folders(subdir_path, intermediate_directory)
    # project_index % 4 spreads the work over the four knowledge bases.
    classify_folders(intermediate_directory, final_directory, project_index % 4)

    # Clean up the scratch directory once classification finished.
    shutil.rmtree(intermediate_directory)
    logging.info(f"Deleted intermediate directory: {intermediate_directory}")
|
||||
|
||||
def main(base_directory, base_intermediate_directory, final_directory):
    """Fan out every subdirectory of *base_directory* to process_directory workers.

    Runs up to four directories in parallel; failures are logged, not re-raised.
    """
    os.makedirs(final_directory, exist_ok=True)
    logging.info(f"Final directory ensured at: {final_directory}")

    project_index = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for subdir in os.listdir(base_directory):
            subdir_path = os.path.join(base_directory, subdir)
            if os.path.isdir(subdir_path):
                logging.info(f"Submitting job for directory: {subdir_path}")
                future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index)
                futures.append(future)
                project_index += 1

        # Surface worker exceptions; a failed directory does not stop the others.
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logging.error(f"Thread resulted in an error: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Hard-coded local paths for a one-off batch run over the 2023 archive.
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    base_intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, base_intermediate_directory, final_directory)
|
@ -1,66 +0,0 @@
|
||||
import fitz # PyMuPDF
|
||||
import re
|
||||
|
||||
def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """Extract paragraphs (body text and tables) containing any of *keywords*.

    When a matched paragraph also contains one of *follow_up_keywords*, keep
    collecting subsequent paragraphs until a block starting with a sibling
    section number (same prefix, next ``x.y`` level) is reached.

    :param pdf_path: path to the PDF file.
    :param keywords: substrings that trigger extraction of a block.
    :param follow_up_keywords: substrings that switch on continued collection.
    :return: list of extracted paragraph strings, in document order.
    """
    doc = fitz.open(pdf_path)
    extracted_paragraphs = []
    continue_collecting = False        # True while in follow-up collection mode
    current_section_pattern = None     # regex that ends follow-up collection

    for page in doc:
        text_blocks = page.get_text("blocks")
        for index, block in enumerate(text_blocks):
            text = block[4].strip()  # block[4] is the text content of the block
            if text == "":  # Skip empty lines
                continue
            if continue_collecting:
                # Stop when a sibling-numbered section begins; otherwise keep text.
                if current_section_pattern and re.match(current_section_pattern, text):
                    continue_collecting = False
                else:
                    extracted_paragraphs.append(text)

            if any(keyword in text for keyword in keywords):
                extracted_paragraphs.append(text)
                if any(follow_up in text for follow_up in follow_up_keywords):
                    continue_collecting = True
                    # Derive the terminating pattern from this block's own number,
                    # e.g. "3.2.1" -> stop at the next "3.2.x" heading.
                    section_number = re.match(r'(\d+(\.\d+)*)', text)
                    if section_number:
                        current_section_number = section_number.group(1)
                        base_section_number = current_section_number.rsplit('.', 1)[0]
                        current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b')
                    else:
                        # No number on this block: peek ahead for the next numbered
                        # block and use its number as the terminator.
                        found_next_number = False
                        for next_index in range(index + 1, len(text_blocks)):
                            next_text = text_blocks[next_index][4].strip()
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b')
                            elif found_next_number:
                                break
    doc.close()
    return extracted_paragraphs
|
||||
|
||||
# Example usage
# NOTE(review): doc_path points at a .docx while extract_text_with_keywords opens
# it with fitz (PyMuPDF) — confirm the pinned PyMuPDF version accepts this format.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)

# Write extracted paragraphs to a temp file.
# NOTE(review): despite the original "handling duplicates" comment, no
# de-duplication is performed here.
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    for content in extracted_contents:
        file.write(content + '\n')

# file_id = upload_file(output_file)
# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)
|
@ -1,305 +0,0 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.old_version.基础信息整合 import combine_basic_info
|
||||
from flask_app.old_version.资格审查模块old import combine_review_standards
|
||||
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||
from flask_app.general.docx截取docx import copy_docx
|
||||
|
||||
def get_global_logger(unique_id):
    """Return the logger registered under *unique_id*, or the root logger if None."""
    if unique_id is not None:
        return logging.getLogger(unique_id)
    return logging.getLogger()  # fall back to the default (root) logger
|
||||
|
||||
logger=None
|
||||
|
||||
# 创建全局线程池
|
||||
executor = ThreadPoolExecutor()
|
||||
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
    """Prepare a tender document for parallel analysis.

    Converts the input so both docx and pdf versions exist, kicks off an async
    knowledge-base upload, truncates the PDF into its sections, and extracts the
    bidder-notice table to JSON.

    :param file_type: 1 for docx input, 2 for pdf input; anything else aborts.
    :return: dict of intermediate paths plus the pending upload Future, or None
        when the file type is unsupported.
    """
    logger.info("starting 文件预处理...")
    logger.info("output_folder..." + output_folder)

    # Make sure we have both a docx (for the knowledge base) and a pdf (for
    # truncation), regardless of which format was uploaded.
    if file_type == 1:  # docx
        docx_path = downloaded_file_path
        pdf_path = docx2pdf(docx_path)  # convert to pdf for downstream processing
    elif file_type == 2:  # pdf
        pdf_path = downloaded_file_path
        docx_path = pdf2docx(pdf_path)  # convert to docx for the knowledge base
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Upload to the knowledge base asynchronously; callers resolve the Future later.
    future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)

    # Split the PDF into the standard tender sections.
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder)

    # Derived artifacts for the individual analysis stages.
    truncate0_docpath = pdf2docx(truncate_files[0])  # bidder-notice front table -> docx
    invalid_docpath = copy_docx(docx_path)  # docx slice used for invalid-bid analysis
    truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder)  # front table docx -> json
    truncate0 = truncate_files[0]
    truncate1 = truncate_files[1]
    truncate3 = truncate_files[3]
    merged_baseinfo_path = truncate_files[-1]
    clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # notice body clauses pdf -> json

    logger.info("文件预处理done")

    # Return immediately without waiting for the upload; the Future is included.
    return {
        'file_path': downloaded_file_path,
        'output_folder': output_folder,
        'truncate0': truncate0,
        'truncate1': truncate1,
        'truncate3': truncate3,
        'knowledge_future': future_knowledge,  # pending knowledge-base upload
        'truncate0_jsonpath': truncate_jsonpath,
        'merged_baseinfo_path': merged_baseinfo_path,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath
    }
|
||||
|
||||
|
||||
def post_processing(data, includes):
    """Split *data* into whitelisted keys and an "其他" (other) bucket.

    Keys listed in *includes* are kept at the top level; every other key is
    moved under "其他". The "其他" bucket is dropped when it ends up empty.
    """
    result = {"其他": {}}
    for key, value in data.items():
        # Whitelisted keys stay top-level; the rest land in the catch-all bucket.
        bucket = result if key in includes else result["其他"]
        bucket[key] = value
    # Drop the catch-all when nothing was filed under it.
    if not result["其他"]:
        del result["其他"]
    return result
|
||||
# 基本信息
|
||||
# 基本信息
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
    """Aggregate the tender's basic information; relies on the module-level logger."""
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
    logger.info("基础信息done")
    return basic_res
|
||||
|
||||
|
||||
# 形式、响应、资格评审
|
||||
# 形式、响应、资格评审
def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder):
    """Combine the formal/response/qualification review standards into one result."""
    logger.info("starting资格审查...")
    review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
                                                    clause_path, input_file, output_folder)
    logger.info("资格审查done")
    return review_standards_res
|
||||
|
||||
|
||||
# 评分细则 流式
|
||||
# 评分细则 流式
def fetch_evaluation_standards(truncate1):  # 评标办法前附表
    """Fetch the scoring rules and split them into technical and commercial parts."""
    logger.info("starting 商务标和技术标...")

    # Combined scoring-rule dict from the evaluation-method front table.
    evaluation_standards_res = combine_evaluation_standards(truncate1)

    # Split out the technical and commercial scoring sections (default to {}).
    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
    commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}

    logger.info("商务标和技术标 done")

    # Wrap both sections under explicit keys so the caller can stream them separately.
    return {
        "technical_standards": technical_standards,
        "commercial_standards": commercial_standards
    }
|
||||
|
||||
# def fetch_evaluation_standards(truncate1): # 评标办法前附表
|
||||
# logger.info("starting商务标技术标...")
|
||||
# evaluation_standards_res = combine_evaluation_standards(truncate1)
|
||||
# logger.info("商务标技术标done")
|
||||
# return evaluation_standards_res
|
||||
|
||||
|
||||
# 无效、废标项解析
|
||||
# 无效、废标项解析
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    """Collect the invalid-bid / rejected-bid clauses from the document."""
    logger.info("starting无效标与废标...")
    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
    logger.info("无效标与废标done...")
    return find_invalid_res
|
||||
|
||||
|
||||
# 投标文件要求
|
||||
# 投标文件要求
def fetch_bidding_documents_requirements(clause_path):
    """Extract the bidding-document requirements (notice section 1)."""
    logger.info("starting投标文件要求...")
    fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}
|
||||
|
||||
# 开评定标流程
|
||||
# 开评定标流程
def fetch_bid_opening(clause_path):
    """Extract the bid opening/evaluation/award process (notice section 2)."""
    logger.info("starting开评定标流程...")
    fetch_bid_opening_json = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    return {"开评定标流程": fetch_bid_opening_json}
|
||||
|
||||
# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf
|
||||
# global logger
|
||||
# logger = get_global_logger(unique_id)
|
||||
# # Preprocess files and get necessary data paths and knowledge index
|
||||
# processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
|
||||
# if not processed_data:
|
||||
# return ""
|
||||
#
|
||||
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
# # Submit all tasks to the executor
|
||||
# futures = {
|
||||
# 'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'],
|
||||
# processed_data['truncate0'], output_folder,
|
||||
# processed_data['clause_path']),
|
||||
# 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
|
||||
# processed_data['truncate3'],
|
||||
# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
|
||||
# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']),
|
||||
# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
|
||||
# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
|
||||
# output_folder, processed_data['truncate0_jsonpath'],
|
||||
# processed_data['clause_path'], processed_data['truncate3']),
|
||||
# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
|
||||
# processed_data['clause_path']),
|
||||
# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
|
||||
# }
|
||||
#
|
||||
# comprehensive_responses = []
|
||||
# # Collect results in the defined order
|
||||
# for key in ['base_info', 'qualification_review', 'evaluation_standards', 'invalid_requirements',
|
||||
# 'bidding_documents_requirements', 'opening_bid']:
|
||||
# try:
|
||||
# # Wait for the future to complete and get the result
|
||||
# result = futures[key].result()
|
||||
# comprehensive_responses.append(result)
|
||||
# except Exception as exc:
|
||||
# logger.error(f"Error processing {key}: {exc}")
|
||||
# # 合并 JSON 结果
|
||||
# combined_final_result = combine_json_results(comprehensive_responses)
|
||||
# includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
|
||||
# result = post_processing(combined_final_result, includes)
|
||||
# modified_json = transform_json_values(result)
|
||||
#
|
||||
# final_result_path = os.path.join(output_folder, "final_result.json")
|
||||
# with open(final_result_path, 'w', encoding='utf-8') as file:
|
||||
# json.dump(modified_json, file, ensure_ascii=False, indent=2)
|
||||
# logger.info("final_result.json has been saved")
|
||||
# deleteKnowledge(processed_data['knowledge_index'])
|
||||
# return final_result_path
|
||||
|
||||
|
||||
#分段返回
|
||||
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
    """Parse an engineering tender document and stream results as JSON strings.

    Yields one JSON chunk per analysis section (technical/commercial standards,
    invalid-bid clauses, document requirements, bid opening, basic info,
    qualification review) as each concurrent task completes.
    """
    global logger
    logger = get_global_logger(unique_id)

    # 预处理文件,获取处理后的数据
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})  # signal failure to the consumer
        return  # BUG FIX: previously fell through and dereferenced None below

    index = None  # BUG FIX: ensure defined for the final cleanup even if the upload fails

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Start every task that does not depend on the knowledge-base upload.
        futures = {
            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder, processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['truncate3']),
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
        }

        # Stream the independent results in completion order.
        for future in concurrent.futures.as_completed(futures.values()):
            key = next(k for k, v in futures.items() if v is future)
            try:
                result = future.result()

                # evaluation_standards is split into two separately streamed chunks.
                if key == 'evaluation_standards':
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
                else:
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)

            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)

        # The remaining tasks need the knowledge base; block on the upload now.
        try:
            knowledge_name = "招标解析" + unique_id
            index = processed_data['knowledge_future'].result()  # wait for upload

            future_dependencies = {
                'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'],
                                             output_folder, processed_data['clause_path']),
                'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
                                                        processed_data['truncate3'], knowledge_name,
                                                        processed_data['truncate0_jsonpath'],
                                                        processed_data['clause_path'], processed_data['file_path'],
                                                        processed_data['output_folder']),
            }

            # Stream the dependent results in completion order.
            for future in concurrent.futures.as_completed(future_dependencies.values()):
                key = next(k for k, v in future_dependencies.items() if v is future)
                try:
                    result = future.result()
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
                except Exception as exc:
                    logger.error(f"Error processing {key}: {exc}")
                    yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)

        except Exception as e:
            logger.error(f"Error uploading to knowledge base: {e}")
            yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False)

    # BUG FIX: only delete the knowledge index when it was actually created;
    # previously this raised NameError when the upload had failed.
    if index is not None:
        deleteKnowledge(index)
|
||||
|
||||
if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"
    # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
    # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
    # clause_path = convert_clause_to_json(truncate1, output_folder)
    # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json")

    start_time = time.time()
    file_type = 1  # 1:docx 2:pdf 3:其他
    # NOTE(review): file_type=1 declares docx input but the path below is a .pdf —
    # confirm which was intended before reusing this smoke test.
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
    # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")

    preprocess_files(output_folder, input_file, file_type, "zytest1")
    end_time = time.time()
    elapsed_time = end_time - start_time  # total preprocessing wall time
    print(f"Function execution took {elapsed_time} seconds.")
|
@ -1,350 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import shutil
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
|
||||
|
||||
def validate_pdf(file_path):
    """Return True iff *file_path* is a readable PDF with at least one page."""
    try:
        with open(file_path, "rb") as handle:
            return len(PdfReader(handle).pages) > 0
    except Exception as e:
        # Unreadable / corrupt / missing files are reported and treated as invalid.
        print(f"Error reading PDF {file_path}: {str(e)}")
        return False
|
||||
|
||||
|
||||
def truncate_pdf(source_path, target_path, max_pages=15):
    """Write at most the first *max_pages* pages of *source_path* to *target_path*.

    Any failure (missing file, corrupt PDF, write error) is printed and swallowed.
    """
    try:
        with open(source_path, "rb") as src:
            reader = PdfReader(src)
            writer = PdfWriter()
            page_count = min(max_pages, len(reader.pages))
            for page_index in range(page_count):
                writer.add_page(reader.pages[page_index])
            with open(target_path, "wb") as dst:
                writer.write(dst)
    except Exception as e:
        print(f"Error processing PDF {source_path}: {str(e)}")
|
||||
|
||||
def copy_file(src, dest):
    """Copy one file to *dest*, creating the destination directory as needed."""
    dest_dir = os.path.dirname(dest)
    os.makedirs(dest_dir, exist_ok=True)
    shutil.copy2(src, dest)  # copy2 preserves file metadata
|
||||
|
||||
def copy_directory(source, destination):
    """Recursively copy the contents of *source* into *destination*."""
    os.makedirs(destination, exist_ok=True)
    for entry in os.listdir(source):
        entry_src = os.path.join(source, entry)
        entry_dst = os.path.join(destination, entry)
        # Files are copied directly; directories recurse.
        if os.path.isfile(entry_src):
            copy_file(entry_src, entry_dst)
        else:
            copy_directory(entry_src, entry_dst)
|
||||
|
||||
def unique_file_name(base_path, file_name):
    """Return *file_name*, or a ``stem_N.ext`` variant, unused under *base_path*."""
    stem, ext = os.path.splitext(file_name)
    candidate = file_name
    suffix = 1
    # Bump the numeric suffix until the name is free.
    while os.path.exists(os.path.join(base_path, candidate)):
        candidate = f"{stem}_{suffix}{ext}"
        suffix += 1
    return candidate
|
||||
|
||||
def process_pdf_folders(source_dir, target_dir):
    """Mirror all valid PDFs from *source_dir* into *target_dir*, keeping structure.

    Corrupt PDFs are reported and skipped; directories left empty are removed.
    """
    # topdown=False so empty-directory cleanup sees children before parents.
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if file.lower().endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                if validate_pdf(source_file_path):
                    # Preserve the relative layout under the target directory.
                    relative_path = os.path.relpath(root, source_dir)
                    target_file_dir = os.path.join(target_dir, relative_path)
                    target_file_path = os.path.join(target_file_dir, file)
                    copy_file(source_file_path, target_file_path)
                else:
                    # NOTE(review): message says "Deleted" but the corrupt file is
                    # only skipped, never removed — confirm the intended behavior.
                    print(f"Deleted corrupt file: {source_file_path}")
        # Remove directories that ended up empty.
        if not os.listdir(root):
            os.rmdir(root)
|
||||
|
||||
|
||||
def classify_folders(source_dir, target_dir, project_index):
    """Classify the folders under *source_dir* into tender / evaluation / bid sets.

    A folder containing ``zb.pdf`` is the tender folder and establishes the
    project directory; ``输出文件`` holds evaluation files; everything else is
    treated as bid folders. *source_dir* is removed when done.
    """
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)

    target_project_dir = None
    target_zb_name = None  # BUG FIX: was unbound when no zb.pdf folder existed
    processed_dirs = set()  # Track directories already classified

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            # Tender folder: establishes the project directory and issuer name.
            target_project_dir, project_index, target_zb_name = process_tender_files(subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)
        elif subdir.lower() == "输出文件":
            # Evaluation output folder.
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)

    # Process remaining (bid) folders, skipping the ones handled above.
    # BUG FIX: guard against the case where no tender file was found at all —
    # previously this crashed joining paths onto a None project directory.
    if target_project_dir is not None:
        process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name)

    # 删除tmp目录
    # if os.path.exists(temp_dir):
    #     shutil.rmtree(temp_dir)
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)
|
||||
|
||||
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Process a tender folder; return (project dir, next index, issuer name).

    Truncates zb.pdf, asks the model for name/number/issuer/category, renames the
    file after the issuer, and copies the folder into the category tree.

    NOTE(review): extract_answer can return [] on a parse miss, which would make
    the zb_response[...] accesses below raise IndexError — confirm upstream
    guarantees a match, or add a guard.
    """
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # only the first 30 pages are parsed
    bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1)
    project_index += 1
    zb_response = extract_answer(bot_response[1], 1)
    # Directory names cannot contain slashes; replace them with '-'.
    zb_response[0] = zb_response[0].replace('/', '-').replace('\\', '-')
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]

    # Rename the tender PDF after the issuer.
    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)

    # Project directory: <target>/<category>/<project name>_<project number>.
    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))

    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, project_index, zb_response[2]
|
||||
|
||||
|
||||
def process_evaluation_files(subdir_path, target_project_dir):
    """Copy the evaluation folder into the project's 评标文件夹, then remove it.

    NOTE(review): target_project_dir may still be None if the tender folder has
    not been processed yet — os.path.join would then raise; confirm ordering.
    """
    copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
    shutil.rmtree(subdir_path)
|
||||
|
||||
|
||||
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Classify the remaining (bid) folders under source_dir into the project's 投标文件夹.

    Each file is truncated, parsed against knowledge base ``file_<index>``,
    renamed after the detected section, and the folder is finally copied under a
    directory named after the detected bidder.
    """
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # Skip already-processed directories

        target_tbcom_dir = None  # Created lazily from the first recognized file

        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            truncated_tb_path = os.path.join(temp_dir, "truncate_" + item)
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2)
            project_index += 1  # each parsed file gets its own knowledge index
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                continue  # If there's no usable response, skip this file

            # Initialize target_tbcom_dir only once, based on the first processed
            # file — and only when the detected "bidder" is neither the tender
            # issuer nor the "未知" (unknown) placeholder.
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if target_tb_name != target_zb_name and target_tb_name != "未知":
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)

            # Rename the file after the detected section, avoiding name collisions.
            # BUG FIX: the original assigned new_tb_path twice; the first assignment
            # (without unique_file_name) was dead code and has been removed.
            tb_section = tb_response[1] + ".pdf"
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)

            # Remove the temporary truncated file.
            os.remove(truncated_tb_path)

        # Copy the whole directory at once after all files have been processed
        # and renamed, then drop the original.
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)
|
||||
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
import re
|
||||
|
||||
def extract_answer(input_string, type):
    """Extract structured fields from a model's answer string.

    type == 1: tender-notice info -> [project_name, project_no, tenderer, category]
    type == 2: bid-document info  -> [bidder, category]

    Returns [] when nothing matches or the type is unknown, so callers can
    simply truth-test the result. (Previously an unknown type fell through
    and returned None implicitly.)
    """
    # Pattern 1: project name / number / tenderer / category.
    # \s* after each value tolerates trailing spaces; both half- and
    # full-width colons/semicolons are accepted.
    pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"
    # Pattern 2: bidder / category.
    pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"

    if type == 1:
        match = re.search(pattern1, input_string)
        if match:
            print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}")
            return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()]
        print("No match found for type 1.")
        return []
    elif type == 2:
        match = re.search(pattern2, input_string)
        if match:
            # Bid letters ("投标函") are filed under the business-document category.
            part = match.group(2).strip()
            if "投标函" in part:
                part = "商务文件"
            return [match.group(1).strip(), part]
        print("No match found for type 2.")
        return []
    # Unknown type: keep the result type consistent (always a list).
    return []
|
||||
|
||||
|
||||
from llama_index.readers.dashscope.base import DashScopeParse
|
||||
from llama_index.readers.dashscope.utils import ResultType
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
||||
from dashscope import Assistants, Messages, Runs, Threads
|
||||
|
||||
|
||||
def send_message(assistant, index,documents,message='百炼是什么?'):
    """Run one query against a DashScope assistant and return all reply texts.

    assistant: an Assistants object created by the caller.
    index/documents: the temporary knowledge-base index and the parsed
        documents backing it; they are cleaned up at the end via
        deleteFileFromKnowledge once the answer has been retrieved.
    message: the user question sent to the assistant.
    Returns a list of message texts in chronological order.
    """
    print(f"Query: {message}")

    # Create a conversation thread.
    thread = Threads.create()
    print(thread)

    # Post the user message into the thread.
    message = Messages.create(thread.id, content=message)

    # Start a run of the assistant on that thread.
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))

    # Poll once for the run status.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)

    # Block until the run completes (or requires action).
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)

    # Re-fetch the final status; tool-result submission would go here if needed.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)

    # Fetch every message in the thread.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))

    ans = []

    print("运行结果:")
    # Messages arrive newest-first; reverse so answers read in order.
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
        print("\n")
    # Remove the temporary documents from the knowledge base once answered.
    deleteFileFromKnowledge(index,documents)
    return ans
|
||||
|
||||
|
||||
def file_parse(filepath, knowledge_index, type):
    """Parse a file, index it into a temporary knowledge base, and ask the
    classification questions for the given document type.

    filepath: path of the file to classify.
    knowledge_index: label supplied by callers (kept for interface
        compatibility; not otherwise read here).
    type: 1 = tender (招标) file questions, 2 = bid (投标) file questions.
    Returns the assistant's answers as a list of strings (see send_message).
    """
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    # All files share one temporary knowledge base; the documents are removed
    # again inside send_message -> deleteFileFromKnowledge.
    index = DashScopeCloudIndex("文件分类临时知识库")
    index._insert(documents)
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }

                        }
                    }
                }
            }]
    )
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\
请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘"
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’"
    if (type == 1):
        questions = questions1
    elif (type == 2):
        questions = questions2
    # NOTE(review): if type is neither 1 nor 2, `questions` is unbound and the
    # next line raises NameError — confirm callers only ever pass 1 or 2.
    return send_message(assistant,index,documents, message=questions)
|
||||
|
||||
def deleteFileFromKnowledge(index,documents):
    """Remove the given parsed documents from the knowledge-base index.

    Collects every truthy `id_` attribute found on the documents and hands the
    resulting id list to the index in a single bulk delete call.
    """
    # Anything other than a non-empty list yields no ids at all.
    doc_list = documents if isinstance(documents, list) else []
    # getattr() tolerates documents that lack an `id_` attribute.
    file_ids = [doc_id for doc_id in (getattr(doc, 'id_', None) for doc in doc_list) if doc_id]
    index.delete_ref_doc(file_ids)
|
||||
|
||||
def process_directory(base_directory, intermediate_directory, final_directory):
    """Classify every project sub-directory under base_directory.

    Each sub-directory is preprocessed into intermediate_directory, classified
    into final_directory, and the intermediate directory is wiped before the
    next project is handled.
    """
    for entry in os.listdir(base_directory):
        project_path = os.path.join(base_directory, entry)
        if not os.path.isdir(project_path):
            continue  # plain files at this level are ignored
        process_pdf_folders(project_path, intermediate_directory)
        classify_folders(intermediate_directory, final_directory, 0)
        # Reset the scratch area so the next project starts clean.
        if os.path.exists(intermediate_directory):
            shutil.rmtree(intermediate_directory)
|
||||
|
||||
def main(base_directory, intermediate_directory, final_directory):
    """Entry point: ensure the working directories exist, then process projects."""
    for directory in (intermediate_directory, final_directory):
        os.makedirs(directory, exist_ok=True)
    process_directory(base_directory, intermediate_directory, final_directory)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, intermediate_directory, final_directory)  # process nested (multi-level) directories
    # process_pdf_folders(base_directory, intermediate_directory)  # process a single-level directory
    # classify_folders(intermediate_directory, final_directory, 0)
|
@ -1,55 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# This file is auto-generated, don't edit it. Thanks.
|
||||
import os
|
||||
from alibabacloud_bailian20231229.client import Client as bailian20231229Client
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_bailian20231229 import models as bailian_20231229_models
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
from alibabacloud_tea_util.client import Client as UtilClient
|
||||
|
||||
def create_client() -> bailian20231229Client:
    """
    Initialise an account Client using AK & SK credentials.
    @return: Client
    @throws Exception
    """
    config = open_api_models.Config(
        # Credentials are read from the environment; never hard-code keys.
        access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'],
        access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET']
    )
    # Bailian service endpoint (Beijing region).
    config.endpoint = 'bailian.cn-beijing.aliyuncs.com'
    return bailian20231229Client(config)
|
||||
|
||||
def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    """Delete one knowledge-base index from a Bailian workspace.

    Errors are reported to stdout (message plus the service's recommendation)
    rather than raised, matching the SDK sample style.
    """
    request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    opts = util_models.RuntimeOptions()
    extra_headers = {}
    try:
        response = client.delete_index_with_options(workspace_id, request, extra_headers, opts)
        print("routes Response:", response)
    except Exception as error:
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)
|
||||
|
||||
async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    """Async variant of delete_index: delete one index from a Bailian workspace.

    Errors are printed instead of propagated, matching the SDK sample style.
    """
    request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    opts = util_models.RuntimeOptions()
    extra_headers = {}
    try:
        response = await client.delete_index_with_options_async(workspace_id, request, extra_headers, opts)
        print("routes Response:", response)
    except Exception as error:
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)
|
||||
|
||||
if __name__ == '__main__':
    workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID']
    index_id = 'pg5rrsv26x'  # NOTE(review): hard-coded index id — confirm before reuse
    client = create_client()
    delete_index(client, workspace_id, index_id)
|
@ -1,58 +0,0 @@
|
||||
import os
|
||||
import uuid
|
||||
from llama_index.readers.dashscope.base import DashScopeParse
|
||||
from llama_index.readers.dashscope.utils import ResultType
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
||||
from flask_app.old_version.文档理解大模型版知识库处理.删除知识库 import delete_index, create_client
|
||||
|
||||
|
||||
def addfileToKnowledge(filepath,knowledge_name):
    """Parse a local file and create a new DashScope cloud knowledge base
    named `knowledge_name` containing it.

    Returns the created DashScopeCloudIndex.
    """
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    index = DashScopeCloudIndex.from_documents(
        documents,
        knowledge_name,
        verbose=True,
    )
    print("knowledge created successfully!!!")
    # Alternative: insert into an existing knowledge base instead of creating one.
    # index = DashScopeCloudIndex(knowledge_name)
    # index._insert(documents)
    # return index, documents
    return index
|
||||
|
||||
def deleteKnowledge(index):
    """Tear down an entire DashScope knowledge base given its index object."""
    # The retriever's pipeline id doubles as the index id needed for deletion.
    pipeline = index.as_retriever()
    target_id = str(pipeline.pipeline_id)
    workspace = os.environ.get('DASHSCOPE_WORKSPACE_ID')
    delete_index(create_client(), workspace, target_id)
    print("knowledge old_version successfully!!!")
|
||||
|
||||
|
||||
|
||||
def deleteFileFromKnowledge(index, documents):
    """Delete the given parsed documents from the knowledge index.

    Gathers each document's truthy `id_` attribute and issues one bulk delete
    call against the index.
    """
    source = documents if isinstance(documents, list) else []
    # getattr() tolerates documents without an `id_` attribute.
    file_ids = [ident for ident in (getattr(doc, 'id_', None) for doc in source) if ident]
    print("old_version successfully")
    index.delete_ref_doc(file_ids)
|
||||
|
||||
|
||||
# Example usage.
if __name__ == "__main__":
    filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf"
    unique_id = str(uuid.uuid4())
    knowledge_name="招标解析"+unique_id
    # index = addfileToKnowledge(filepath,knowledge_name)
    # Re-attach to an existing knowledge base by its fixed name.
    index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff")
    # Delete individual files:
    # deleteFileFromKnowledge(index, document)
    deleteKnowledge(index)
|
@ -1,33 +0,0 @@
|
||||
from typing import List
|
||||
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
|
||||
from alibabacloud_tea_util.client import Client as UtilClient
|
||||
from alibabacloud_credentials.client import Client as CredClient
|
||||
|
||||
def query():
    """Query the parse status of a previously submitted DocMind job."""
    # Initialise a Credentials client from the default credential chain.
    cred=CredClient()
    config = open_api_models.Config(
        # AccessKey ID obtained from the credential chain.
        cred.get_credential().access_key_id,
        # AccessKey Secret obtained from the credential chain.
        cred.get_credential().access_key_secret,
    )
    # Service endpoint.
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.QueryDocParserStatusRequest(
        # id: the job id returned by the submit endpoint.
        id='docmind-20241008-24363df30d274863894f037dbb7244e8'
    )
    try:
        # Print the response yourself when running this sample.
        response = client.query_doc_parser_status(request)
        # Response layout: body -> data -> attributes (lowercase-initial names).
        # Tip: convert response.body.data to JSON first, then pick the values needed.
        print(response.body.data.status)
    except Exception as error:
        # Log the error if needed.
        UtilClient.assert_as_string(error.message)
|
||||
query()  # NOTE(review): runs at import time; guard with `if __name__ == "__main__":` if reused as a module
|
@ -1,40 +0,0 @@
|
||||
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
|
||||
from alibabacloud_tea_util.client import Client as UtilClient
|
||||
from alibabacloud_tea_util import models as util_models
|
||||
from alibabacloud_credentials.client import Client as CredClient
|
||||
|
||||
def submit_file():
    """Submit a local PDF to the DocMind parser and print the returned job id."""
    # Initialise a Credentials client from the default credential chain.
    cred=CredClient()
    config = open_api_models.Config(
        # AccessKey ID obtained from the credential chain.
        # access_key_id=cred.get_access_key_id(),
        cred.get_credential().access_key_id,
        # AccessKey Secret obtained from the credential chain.
        # access_key_secret=cred.get_access_key_secret()
        cred.get_credential().access_key_secret,
    )
    # Service endpoint.
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest(
        # file_url_object: the local file stream.
        file_url_object=open("zbtest4.pdf", "rb"),
        # file_name: must include the file-type extension.
        file_name='zbtest4.pdf'
        # file_name_extension: alternative to file_name (pick one of the two).
        # file_name_extension='pdf'
    )
    runtime = util_models.RuntimeOptions()
    try:
        # Print the response yourself when running this sample.
        response = client.submit_doc_parser_job_advance(request, runtime)
        # Response layout: body -> data -> attributes; prints the job id.
        print(response.body.data.id)
    except Exception as error:
        # Log the error if needed.
        UtilClient.assert_as_string(error.message)
|
||||
submit_file()  # NOTE(review): runs at import time; guard with `if __name__ == "__main__":` if reused as a module
|
@ -1,35 +0,0 @@
|
||||
from typing import List
|
||||
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
|
||||
from alibabacloud_tea_openapi import models as open_api_models
|
||||
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
|
||||
from alibabacloud_tea_util.client import Client as UtilClient
|
||||
from alibabacloud_credentials.client import Client as CredClient
|
||||
|
||||
def query():
    """Fetch a page of parse results for a completed DocMind job."""
    # Initialise a Credentials client from the default credential chain.
    cred=CredClient()
    config = open_api_models.Config(
        # AccessKey ID obtained from the credential chain.
        cred.get_credential().access_key_id,
        # AccessKey Secret obtained from the credential chain.
        cred.get_credential().access_key_secret,
    )
    # Service endpoint.
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.GetDocParserResultRequest(
        # id: the job id returned by the submit endpoint.
        id='docmind-20241008-24363df30d274863894f037dbb7244e8',
        # Pagination: page size and starting layout index.
        layout_step_size=10,
        layout_num=0
    )
    try:
        # Print the response yourself when running this sample.
        response = client.get_doc_parser_result(request)
        # Response layout: body -> data -> attributes (lowercase-initial names).
        # Tip: convert response.body.data to JSON first, then pick the values needed.
        print(response.body.data)
    except Exception as error:
        # Log the error if needed.
        UtilClient.assert_as_string(error.message)
|
||||
query()  # NOTE(review): runs at import time; guard with `if __name__ == "__main__":` if reused as a module
|
@ -1,440 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import os.path
|
||||
import time
|
||||
import re
|
||||
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.old_version.不得存在及禁止投标情形 import find_forbidden, process_string_list
|
||||
|
||||
|
||||
#处理跨页的段落
|
||||
def preprocess_paragraphs(paragraphs):
    """Merge paragraphs that were split across a page break.

    A blank paragraph sandwiched between two text paragraphs is treated as a
    potential page break: when the preceding text does not end with sentence
    punctuation, is reasonably long (> 30 chars) and the following text does
    not look like a new list item, the two neighbours are joined into one
    paragraph. All other blank paragraphs are simply dropped.

    paragraphs: sequence of objects exposing a `.text` attribute (docx paragraphs).
    Returns a list of stripped paragraph strings.
    """
    # Matches typical list-item openings: "(1)", "A. ", "一、" ...
    list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
    result = []
    pos = 0
    total = len(paragraphs)
    while pos < total:
        text = paragraphs[pos].text.strip()
        if text:
            result.append(text)
            pos += 1
            continue
        # Blank paragraph: try to merge its neighbours (needs both to exist).
        merged = False
        if 0 < pos < total - 1:
            before = paragraphs[pos - 1].text.strip()
            after = paragraphs[pos + 1].text.strip()
            ends_with_punct = before.endswith((',', ',', '。', '!', '?'))
            looks_like_list = re.match(list_item_pattern, after)
            if not ends_with_punct and not looks_like_list and len(before) > 30:
                joined = before + after
                if result:
                    result[-1] = joined  # replace the already-emitted predecessor
                else:
                    result.append(joined)
                pos += 2  # the following paragraph was consumed by the merge
                merged = True
        if not merged:
            pos += 1  # drop the blank paragraph
    return result
|
||||
|
||||
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
|
||||
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
||||
|
||||
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Scan a .docx document and collect paragraph runs keyed by keyword hits.

    doc_path: path of the Word document to scan.
    keywords: regex pattern or list of patterns; a paragraph matching any of
        them starts a new collection entry keyed by that paragraph's text.
    follow_up_keywords: patterns (e.g. "情形之一", "下列") indicating that the
        matched paragraph introduces an enumeration, so subsequent paragraphs
        are appended until the next same-level/parent section heading appears.
    Returns an OrderedDict mapping the matched paragraph text to the list of
    collected paragraph texts (the key itself is the first element).
    """
    from collections import OrderedDict
    from docx import Document
    import re

    if isinstance(keywords, str):
        keywords = [keywords]

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()  # keyword paragraph -> collected texts
    continue_collecting = False           # True while inside an enumeration run
    current_section_pattern = None        # heading regex that terminates a run
    active_key = None                     # key currently being appended to

    def match_keywords(text, patterns):
        # True if any pattern matches anywhere in the text (case-insensitive).
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, current_index):
        # Process one paragraph; may advance current_index when scanning ahead.
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            # A heading matching the terminator pattern closes the current run.
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
            return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # Numbered heading like "3.4.5" (half- or full-width dots).
                section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')

                    # Pattern matching the current level, e.g. 3.4.5.
                    pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+'
                    # Generate patterns for the next section at the same level
                    # and at the parent level.
                    parts = current_section_number.split('.')
                    matched_patterns = [pattern]  # start with the full pattern

                    # Next section at the same level (e.g. 3.4.6).
                    parts[-1] = str(int(parts[-1]) + 1)
                    next_pattern = r'^' + r'\s*\.\s*'.join(parts)
                    matched_patterns.append(next_pattern)

                    # Next parent-level section (e.g. 3.5), if applicable.
                    if len(parts) > 1:
                        parent_section_parts = parts[:-1]
                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
                        parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
                        matched_patterns.append(parent_pattern)

                    # Any of these headings ends the enumeration run.
                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                    current_section_pattern = re.compile(combined_pattern)

                else:
                    # No numbered heading: scan ahead and infer the numbering
                    # style of the enumeration items from the first one seen.
                    found_next_number = False
                    current_section_pattern = None

                    while current_index < len(doc.paragraphs) - 1:
                        current_index += 1
                        next_text = doc.paragraphs[current_index].text.strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    # Dotted alphanumeric numbering, e.g. "1.2".
                                    section_parts = next_section_number.group(1).split('.')
                                    dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    # Parenthesised numbering, e.g. "(1)" / "(1)".
                                    dynamic_pattern = r'^[\(\(]\d+[\)\)]'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key=None
                            break

        return current_index

    # NOTE(review): preprocess_paragraphs returns plain strings while the
    # scan-ahead branch above indexes doc.paragraphs directly — the two
    # indices can drift when paragraphs were merged; confirm this is intended.
    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1

    return extracted_paragraphs
|
||||
|
||||
def preprocess_text_list(text_list):
    """Split glued fragments inside each string and flatten the result.

    A split point is a position right after a CJK character (or CJK/ASCII
    sentence punctuation) that is followed by whitespace plus a Latin letter,
    digit, or parenthesised number. Returns the stripped, non-empty fragments.
    """
    splitter = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))')
    fragments = []
    for entry in text_list:
        for piece in splitter.split(entry):
            cleaned = piece.strip()
            if cleaned:
                fragments.append(cleaned)
    return fragments
|
||||
|
||||
def clean_dict_datas(extracted_contents, keywords,excludes):  # normalise the regex-extracted runs
    """Normalise extracted paragraph runs into two flat text lists.

    extracted_contents: OrderedDict of key -> list of paragraph texts.
    keywords: regex used to locate the relevant sentence inside a lone line.
    excludes: substrings whose presence disqualifies a lone line entirely.
    Returns (all_texts1, all_texts2): single-line items that still need an
    LLM pass, and multi-line runs that are already usable as-is.
    """
    all_texts1 = []
    all_texts2=[]
    # Sentence-terminating punctuation (CJK and ASCII) used for splitting.
    split_pattern = r'(?<=[。!?\!\?])'

    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Drop the line when it contains any excluded substring.
                if any(exclude in data for exclude in excludes):
                    continue
                # Strip leading enumeration markers: "(1)", letter+number,
                # "1.2", "一、" ...
                pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Keep everything up to the keyword plus the first full
                    # sentence containing it.
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        # Only the first sentence (punctuation retained).
                        cleaned_text = data[:start_pos] + sentences[0]
                    else:
                        cleaned_text = data  # no terminator found: keep whole string
                else:
                    # Keyword absent after stripping: keep the original text.
                    cleaned_text = data
                # Remove regular and full-width spaces.
                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('　', '')
                # Fragments of 8 chars or fewer are treated as noise.
                if len(cleaned_text_no_spaces) > 8:
                    all_texts1.append(cleaned_text_no_spaces)

        else:
            # Multi-line run: re-split glued fragments, strip the first line's
            # enumeration marker, then join everything back with newlines.
            new_text_list=preprocess_text_list(text_list)
            pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
            data = re.sub(pattern, '', new_text_list[0]).replace('　','').strip()
            # Re-join the cleaned first element with the remaining lines.
            new_text_list[0] = data
            joined_text = "\n".join(new_text_list)
            # Remove regular and full-width spaces.
            joined_text_no_spaces = joined_text.replace(' ', '').replace('　', '')
            all_texts2.append(joined_text_no_spaces)

    return all_texts1,all_texts2  # all_texts1 still needs an LLM pass; all_texts2 is final
|
||||
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively collect keyword-bearing sentences from nested JSON data.

    Returns (sentences1, sentences2): sentences without any follow-up keyword,
    and multi-sentence spans introduced by a follow-up keyword (e.g. "下列").
    """
    sentences1 = []  # hits without a follow-up keyword
    sentences2 = []  # hits with a follow-up keyword (span until next section)

    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # Split into sentence-like units on terminal punctuation and on
        # enumeration markers ("1.2", "3、", "(4)") to keep items intact.
        split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i].strip()
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # Follow-up keyword present: capture from here until the
                    # next numbered section heading (e.g. "2.3" / "2.3.4").
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break

                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    # Strip leading enumeration markers and full-width spaces.
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                    data = re.sub(pattern, '', full_text).replace('　','').strip()
                    sentences2.append(data)  # span with follow-up keyword
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    # Single-sentence hit: strip markers, newlines and spaces.
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                    cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace('　','').strip()
                    if len(cleaned_sentence) > 8:
                        sentences1.append(cleaned_sentence)  # hit without follow-up
                    i += 1
            else:
                i += 1

    return sentences1, sentences2  # the two result lists
|
||||
|
||||
def extract_sentences_from_json(json_path, keywords,follow_up_keywords):
    """Load JSON from json_path and extract keyword-bearing sentences.

    Returns the (sentences1, sentences2) pair produced by
    find_sentences_with_keywords; returns ([], []) when json_path is falsy.

    Fix: the descriptive docstring previously sat as a stray string statement
    in the middle of the body; it is now a proper function docstring.
    """
    if not json_path:
        return [], []
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return find_sentences_with_keywords(data, keywords, follow_up_keywords)
|
||||
|
||||
#处理无效投标
|
||||
def extract_values_if_contains(data, includes):
    """
    Recursively collect every string value in `data` that contains any of the
    keywords in `includes`.

    Parameters:
        data (dict | list | str): dict/list tree parsed from JSON.
        includes (list): keyword substrings to look for.

    Returns:
        list: the matching string values, in traversal order.
    """
    included_values = []  # result accumulator

    # Recursive walker over the nested structure.
    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                if isinstance(value, (dict, list)):
                    # Fix: lists nested inside dicts were previously skipped;
                    # recurse into both container kinds.
                    recursive_search(value)
                elif isinstance(value, str):
                    # Keep the value if it contains any keyword.
                    if any(include in value for include in includes):
                        included_values.append(value)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item)
        elif isinstance(current_data, str):
            # Fix: bare strings reached via a list were previously ignored.
            if any(include in current_data for include in includes):
                included_values.append(current_data)

    recursive_search(data)
    return included_values
|
||||
|
||||
|
||||
#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
|
||||
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。
|
||||
#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",
|
||||
|
||||
# TODO: when truncate_json_path is empty, extract the table data separately.
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    """Run one invalid-bid query: gather candidate clauses from the document
    body and tables, let the LLM pick the relevant ones, and merge results.

    file_path: docx whose paragraphs are scanned with `keywords`.
    user_query: prompt sent to qianwen-long along with the candidate list.
    output_file: temp file the numbered candidates are written to (uploaded).
    result_key: key under which the selected clauses are returned.
    truncate_json_path: JSON dump of the bidder-notice table (may be empty).
    Returns {result_key: [clauses]} or {result_key: ""} when nothing matched.
    """
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
        follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
        # Body text (non-table) paragraph runs keyed by the matched paragraph.
        extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)
        # print(extracted_contents)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
        # Table data extracted from the pre-built JSON.
        all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
        qianwen_txt = all_texts1 + all_tables1
        selected_contents = set()  # set deduplicates selected clauses

        if qianwen_txt:
            # Write the numbered candidates, separated by dotted lines, so the
            # model can answer with a list of indices.
            with open(output_file, 'w', encoding='utf-8') as file:
                counter = 1
                for content in qianwen_txt:
                    file.write("..............." + '\n')
                    file.write(f"{counter}. {content}\n")
                    counter += 1

            file_id = upload_file(output_file)
            qianwen_ans = qianwen_long(file_id, user_query)
            # Parse the "[x,x,x]" style answer into a list of indices.
            num_list = process_string_list(qianwen_ans)
            print(result_key + "选中的序号:" + str(num_list))

            # Map each selected 1-based index back to its candidate text.
            for index in num_list:
                if 1 <= index <= len(qianwen_txt):
                    content = qianwen_txt[index - 1]
                    selected_contents.add(content)

        # Regardless of whether qianwen_txt was empty, the multi-line runs
        # (all_texts2 / all_tables2) are trusted as-is.
        selected_contents.update(all_texts2)
        selected_contents.update(all_tables2)

        # Return the collected clauses, or "" when nothing was found.
        if selected_contents:
            res = {result_key: list(selected_contents)}
        else:
            res = {result_key: ""}

        return res
    except Exception as e:
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}
|
||||
|
||||
def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification):
    """Aggregate every "invalid bid / void bid / forbidden" style clause.

    Converts the bidder-notice table PDF to docx and then to JSON, runs the
    invalid-bid and void-bid queries in parallel threads, appends the
    forbidden-situations extraction, and merges everything under the
    top-level key "无效标与废标项".
    """
    tobidders_notice_table_docx = pdf2docx(tobidders_notice_table)  # notice-table PDF -> docx
    truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir)  # docx tables -> json
    queries = [
        (
            r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
            "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        (
            r'废\s*标',
            "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
            os.path.join(output_dir, "temp2.txt"),
            "废标项"
        )
    ]
    results = []
    # Fan the two queries out to worker threads.
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(
                handle_query,
                invalid_docpath,
                user_query,
                output_file,
                result_key,
                keywords,
                truncate_jsonpath
            )
            futures.append((future, result_key))
            time.sleep(0.5)  # stagger submissions to ease service rate limits
        # Collect results in submission order.
        for future, result_key in futures:
            try:
                result = future.result()
            except Exception as e:
                print(f"线程处理 {result_key} 时出错: {e}")
                result = {result_key: ""}
            results.append(result)

    # Forbidden-to-bid situations (find_forbidden).
    try:
        # print("starting forbidden situations...")
        forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification)
    except Exception as e:
        print(f"find_forbidden 处理时出错: {e}")
        forbidden_res = {'不得存在的其他情形': ""}
    results.append(forbidden_res)

    # Merge all partial dicts into a single result mapping.
    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    # print("invalid/void bid extraction done...")
    return {"无效标与废标项": combined_dict}
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc local smoke test with hard-coded Windows paths.
    start_time = time.time()
    tobidders_notice_table=""
    clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
    qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout"
    # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
    invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx'
    results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", json.dumps(results,ensure_ascii=False,indent=4))
|
@ -1,221 +0,0 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.main.基础信息整合快速版 import combine_basic_info
|
||||
from flask_app.main.资格审查模块 import combine_review_standards
|
||||
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
|
||||
from flask_app.general.docx截取docx import copy_docx
|
||||
|
||||
def get_global_logger(unique_id):
    """Return the logger registered under *unique_id*.

    Falls back to the root logger when no id is supplied, so callers can
    always log even before a request id exists.
    """
    if unique_id is None:
        return logging.getLogger()  # default (root) logger
    return logging.getLogger(unique_id)
|
||||
|
||||
logger=None
|
||||
|
||||
# 创建全局线程池
|
||||
executor = ThreadPoolExecutor()
|
||||
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
    """Normalize the downloaded bid document and derive every artifact the
    downstream parsers need.

    Args:
        output_folder: Directory that receives all generated files.
        downloaded_file_path: Path of the uploaded source document.
        file_type: 1 = docx, 2 = pdf, 3 = doc; any other value aborts.
        unique_id: Request identifier forwarded to the PDF truncation step.

    Returns:
        dict mapping artifact names to file paths, or None when *file_type*
        is unsupported.
    """
    logger.info("starting 文件预处理...")
    logger.info("output_folder..." + output_folder)

    # Make sure both a PDF (for truncation) and a DOCX (for knowledge-base
    # upload) of the same document exist, whatever format was uploaded.
    if file_type == 1:  # docx
        docx_path = downloaded_file_path
        pdf_path = docx2pdf(docx_path)  # docx -> pdf for later truncation
    elif file_type == 2:  # pdf
        pdf_path = downloaded_file_path
        docx_path = pdf2docx(pdf_path)  # pdf -> docx for knowledge-base upload
    elif file_type == 3:  # doc
        pdf_path = docx2pdf(downloaded_file_path)
        docx_path = doc2docx(downloaded_file_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Cut the source PDF into the per-section files consumed below.
    # NOTE(review): index meanings (0=notice front table, 1=evaluation method,
    # 2=notice body, 3=qualification, 5=invalid, -1=merged base info) are
    # inferred from usage here — confirm against truncate_pdf_multiple.
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder, unique_id)

    # 投标人须知前附表 (bidder-notice front table) pdf -> docx
    truncate0_docpath = pdf2docx(truncate_files[0])

    invalid_path = truncate_files[5]
    invalid_docpath = copy_docx(docx_path)  # docx copy used for invalid-bid extraction
    # invalid_docpath=pdf2docx(invalid_path)
    truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder)  # front-table docx -> json
    truncate0 = truncate_files[0]
    truncate1 = truncate_files[1]
    truncate2 = truncate_files[2]
    truncate3 = truncate_files[3]
    merged_baseinfo_path = truncate_files[-1]
    clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # notice-body clauses pdf -> json

    logger.info("文件预处理done")

    # Bundle every derived path for the downstream fetch_* helpers.
    return {
        'file_path': downloaded_file_path,
        'output_folder': output_folder,
        'invalid_path': invalid_path,
        'truncate0': truncate0,
        'truncate1': truncate1,
        'truncate2': truncate2,
        'truncate3': truncate3,
        'truncate0_jsonpath': truncate_jsonpath,
        'merged_baseinfo_path': merged_baseinfo_path,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath
    }
|
||||
|
||||
|
||||
def post_processing(data, includes):
    """Partition *data* by key whitelist.

    Keys listed in *includes* are kept at the top level; every other entry
    is moved under the '其他' (other) bucket, which is placed first in the
    result and dropped entirely when it ends up empty.
    """
    grouped = {"其他": {}}
    for field, content in data.items():
        if field in includes:
            grouped[field] = content
        else:
            grouped["其他"][field] = content
    # Drop the bucket when nothing fell through to it.
    if not grouped["其他"]:
        grouped.pop("其他")
    return grouped
|
||||
# 基本信息
|
||||
def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path):
    """Aggregate the project's basic-information section (投标人须知前附表)."""
    logger.info("starting基础信息...")
    res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path)
    logger.info("基础信息done")
    return res
|
||||
|
||||
|
||||
# 形式、响应、资格评审
|
||||
def fetch_qualification_review(truncate1, truncate3, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path):
    """Run the form/response/qualification review (形式、响应、资格评审)."""
    logger.info("starting资格审查...")
    review = combine_review_standards(
        truncate1, truncate3, output_folder, truncate0_jsonpath,
        clause_path, invalid_path, merged_baseinfo_path)
    logger.info("资格审查done")
    return review
|
||||
|
||||
|
||||
# 评分细则 流式
|
||||
def fetch_evaluation_standards(truncate1):  # 评标办法前附表
    """Build the scoring sections, split into technical and commercial parts.

    Returns a dict with keys 'technical_standards' and 'commercial_standards'
    so the caller can stream the two halves independently.
    """
    logger.info("starting 商务标和技术标...")
    combined = combine_evaluation_standards(truncate1)
    logger.info("商务标和技术标 done")
    # Wrap each half under its own top-level key for separate streaming.
    return {
        "technical_standards": {"技术评分": combined.get("技术评分", {})},
        "commercial_standards": {"商务评分": combined.get("商务评分", {})},
    }
|
||||
|
||||
|
||||
# 无效、废标项解析
|
||||
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    """Extract the invalid-bid / rejected-bid clauses (无效标与废标)."""
    logger.info("starting无效标与废标...")
    invalid = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
    logger.info("无效标与废标done...")
    return invalid
|
||||
|
||||
|
||||
# 投标文件要求
|
||||
def fetch_bidding_documents_requirements(clause_path):
    """Extract the bid-document requirements section (投标文件要求)."""
    logger.info("starting投标文件要求...")
    extracted = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    return {"投标文件要求": extracted}
|
||||
|
||||
# 开评定标流程
|
||||
def fetch_bid_opening(clause_path):
    """Extract the bid opening / evaluation / award workflow (开评定标流程)."""
    logger.info("starting开评定标流程...")
    extracted = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    return {"开评定标流程": extracted}
|
||||
|
||||
#分段返回
|
||||
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
    """Stream the parsed sections of an engineering bid document.

    Preprocesses the downloaded file, fans the independent extraction tasks
    out to a thread pool, and yields each section as a JSON string as soon
    as its task finishes (分段返回).

    Args:
        output_folder: Directory for all generated intermediate files.
        downloaded_file_path: Path of the uploaded document.
        file_type: 1 = docx, 2 = pdf, 3 = doc.
        unique_id: Request identifier used for per-request logging.

    Yields:
        JSON strings, one per finished section; 'evaluation_standards' is
        split into 'technical_standards' and 'commercial_standards'.
    """
    global logger
    logger = get_global_logger(unique_id)
    # 预处理文件,获取处理后的数据
    # BUG FIX: unique_id was not forwarded, which made this 4-argument
    # function raise TypeError on every call.
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})  # preprocessing failed: emit an empty JSON object
        return  # BUG FIX: stop the generator instead of dereferencing None below

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Launch every independent extraction task immediately.
        futures = {
            'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'],
                                         processed_data['truncate0'],
                                         processed_data['truncate2'], processed_data['clause_path']),
            'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
                                                    processed_data['truncate3'], output_folder,
                                                    processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['invalid_path'],
                                                    processed_data['merged_baseinfo_path']),
            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder, processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['truncate3']),
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
                                                              processed_data['clause_path']),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
        }
        # O(1) reverse lookup instead of scanning futures.items() per result.
        future_to_key = {future: key for key, future in futures.items()}

        # Emit each section in completion order.
        for future in concurrent.futures.as_completed(futures.values()):
            key = future_to_key[future]
            try:
                result = future.result()

                if key == 'evaluation_standards':
                    # Split scoring into its technical and commercial halves.
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
                else:
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)

            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: stream every parsed section of a fixed sample file.
    started = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
    file_type = 2  # 1:docx 2:pdf 3:其他
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
    print("yes")
    for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
        print(output)
    elapsed_time = time.time() - started
    print(f"Function execution took {elapsed_time} seconds.")
|
@ -1,42 +0,0 @@
|
||||
import os
|
||||
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.old_version.形式响应评审old import process_reviews
|
||||
from flask_app.old_version.资格评审old import process_qualification
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
|
||||
def combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder):  # 评标办法前附表
    """Assemble the full qualification-review section (资格审查).

    Queries qianwen-long for the form/response/qualification criteria from
    the scoring-method front table, then resolves the qualification part and
    the form/response part concurrently and merges the two results.
    """
    # 形式评审、响应评审: qianwen-long over the uploaded front table.
    file_id = upload_file(truncate1)
    user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
    results = qianwen_long(file_id, user_query_1)
    original_dict_data = extract_content_from_json(results)
    # Pull the qualification criteria out; the rest is form/response review.
    qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')

    with ThreadPoolExecutor() as executor:
        future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
        future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
                                               clause_path, input_file, output_folder)
        final_qualify_json = future_qualification.result()
        form_response_dict = future_form_response.result()

    form_response_dict.update(final_qualify_json)
    # return nest_json_under_key(form_response_dict,"资格审查")
    return {"资格审查": form_response_dict}
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test against a fixed extracted-file layout.
    input_file = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf"
    output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21"
    # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
    truncate2 = os.path.join(output_folder, "ztbfile_tobidders_notice.pdf")
    knowledge_name = "zbtest20"
    truncate1 = os.path.join(output_folder, "ztbfile_evaluation_method.pdf")
    truncate3 = os.path.join(output_folder, "ztbfile_qualification.pdf")
    clause_path = convert_clause_to_json(truncate2, output_folder)
    truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
    res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder)
    print(res)
|
@ -1,182 +0,0 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
# 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典
|
||||
import json
|
||||
import re
|
||||
from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json
|
||||
from flask_app.general.多线程提问 import multi_threading, read_questions_from_file
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
|
||||
|
||||
def merge_dictionaries_under_common_key(dicts, common_key):
    """Merge the sub-dicts stored under *common_key* across all *dicts*.

    Dicts missing the key are reported with a warning and skipped; later
    entries overwrite earlier ones on duplicate inner keys.
    """
    merged_dict = {common_key: {}}
    for entry in dicts:
        try:
            inner = entry[common_key]
        except KeyError:
            print(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
        else:
            merged_dict[common_key].update(inner)
    return merged_dict
|
||||
|
||||
|
||||
def generate_qual_question(matching_keys_list):  # assumes 资质/信誉/人员 requirements are either all present or all absent
    """Build the two LLM prompts for the qualification-review keys.

    Returns an empty list when *matching_keys_list* is empty; otherwise a
    two-element list: one prompt covering the listed keys, one prompt about
    personnel qualification requirements.
    """
    if not matching_keys_list:
        return []
    # Quote every key and join with the Chinese enumeration comma.
    keys_string = "、".join("'{}'".format(key) for key in matching_keys_list)
    question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string},"
                 "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。")
    question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则无需返回该键值对。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。"
    return [question1, question2]
|
||||
|
||||
|
||||
def extract_matching_keys_qual(dict_data):
    """Split front-table entries into reference-only keys and substantive ones.

    A value matching a chapter/clause reference pattern (第…章/款/项/目, 符合)
    has no real content, so its key needs a follow-up LLM query — unless the
    key itself is in the exclusion list (consortium, personnel, ...), which
    is handled elsewhere. Values without a reference pattern already carry
    substantive content and are returned as-is.

    Returns:
        (matching_keys, non_matching_keys) — e.g. ['资质条件', '财务状况'] and
        {'营业执照': '具备有效的营业执照', ...}. Both empty when the data looks
        like a qualification attachment (key contains 附件 and 资质).
    """
    reference_patterns = [re.compile(p) for p in
                          (r"第.*?章", r"第.*?款", r"第.*?项", r"第.*?目", r"符合")]
    # Consortium / forbidden-situation / personnel keys get dedicated queries elsewhere.
    excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人', '人员']
    matching_keys = []
    non_matching_keys = {}

    for key, value in dict_data.items():
        if "附件" in key and "资质" in key:
            return [], []  # attachment-style data: bail out immediately
        if any(pattern.search(value) for pattern in reference_patterns):
            if not any(token in key for token in excludes):
                matching_keys.append(key)
            # excluded reference keys are deliberately dropped
        else:
            # Value carries substantive content; no follow-up query needed.
            non_matching_keys[key] = value

    return matching_keys, non_matching_keys
|
||||
|
||||
|
||||
# 获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag
|
||||
# Consortium-bidding requirements are not always in the qualification table,
# so they are fetched through the RAG knowledge base instead.
def get_consortium_dict(knowledge_name):
    """Query the knowledge base for consortium-bidding requirements and
    combine the answers into a single dict."""
    consortium_questions = [
        "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。"]
    answers = []
    # multi_threading yields (index, response); response[1] holds the answer.
    for _, response in multi_threading(consortium_questions, knowledge_name):
        try:
            if response and len(response) > 1:
                answers.append(response[1])
            else:
                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"资格评审: Error processing response for query index {_}: {e}")
    return combine_json_results(answers)
|
||||
|
||||
|
||||
def get_all_dict(knowledge_name):
    """Run the full set of canned qualification-review questions through the
    knowledge base and combine the answers under '资格评审'."""
    qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt'
    questions = read_questions_from_file(qualification_review_file_path)
    answers = []
    # multi_threading yields (index, response); response[1] holds the answer.
    for _, response in multi_threading(questions, knowledge_name):
        try:
            if response and len(response) > 1:
                answers.append(response[1])
            else:
                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"资格评审: Error processing response for query index {_}: {e}")
    return {'资格评审': combine_json_results(answers)}
|
||||
|
||||
|
||||
def process_qualification(qualification_review, truncate3, knowledge_name):
    """Resolve the qualification-review (资格评审) section via one of five
    strategies, depending on how much usable content the scoring-method
    front table already contains.

    Args:
        qualification_review: dict of review factors -> criteria taken from
            the scoring-method front table.
        truncate3: path of the extracted qualification attachment PDF, or ""
            when that section could not be located.
        knowledge_name: knowledge-base name used for RAG fallback queries.

    Returns:
        dict whose single top-level key is '资格评审' (possibly augmented with
        the consortium requirement); {"资格评审": ""} when nothing is found.
    """
    # Split the front-table entries into reference-only keys (need follow-up
    # queries) and keys with substantive content.
    matching_keys_list, non_matching_dict = extract_matching_keys_qual(
        qualification_review)  # e.g. ['资质条件', '财务状况'] / {'营业执照': '具备有效的营业执照', ...}
    if not matching_keys_list:
        if not non_matching_dict:  # front table yielded nothing usable
            if truncate3 != "":  # type1: qualification attachment was captured — query it with default keys
                print("资格评审: type1")
                matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"]
                ques = generate_qual_question(matching_keys_list)
                file_id2 = upload_file(truncate3)
                results2 = multi_threading(ques, "", file_id2, 2)  # query qianwen-long on the attachment
                res_list = []
                if not results2:
                    print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
                else:
                    # Collect every parsed answer about the attachment.
                    for question, response in results2:
                        res_list.append(clean_json_string(response))
                if res_list:
                    merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
                    consortium_dict = get_consortium_dict(knowledge_name)
                    updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)
                    return updated_qualify_json
                else:
                    print("资格评审: 无法获取大模型结果,返回空值")
                    return {"资格评审": ""}
            else:  # type2: no attachment either — fall back to pure RAG
                print("资格评审: type2")
                return get_all_dict(knowledge_name) or {"资格评审": ""}

        else:  # type3: every requirement is already spelled out in the front table
            print("资格评审: type3")
            new_non_matching_json = {'资格评审': non_matching_dict}
            substring = '联合体'
            # Only fetch the consortium requirement when it is not already
            # present, to avoid duplicating it.
            found_key = any(substring in key for key in non_matching_dict.keys())
            if not found_key:
                consortium_dict = get_consortium_dict(knowledge_name)
                final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict)
                return final_qualify_json
            else:
                return new_non_matching_json or {"资格评审": ""}

    elif matching_keys_list and truncate3 == "":  # type4: references exist but the attachment was not captured
        print("资格评审: type4")
        final_qualification = get_all_dict(knowledge_name)
        final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
        return final_qualify_json or {"资格评审": ""}
    else:  # type5: the common case — query the attachment about the referenced keys
        print("资格评审: type5")
        user_querys = generate_qual_question(matching_keys_list)  # prompts targeting '附件:资格审查'
        file_id2 = upload_file(truncate3)
        results2 = multi_threading(user_querys, "", file_id2, 2)  # query qianwen-long on the attachment
        res_list = []
        if not results2:
            print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
            return {"资格评审": ""}
        else:
            for question, response in results2:
                cleaned_res = clean_json_string(response)
                res_list.append(cleaned_res)  # answers about the attachment
            merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
            consortium_dict = get_consortium_dict(knowledge_name)
            updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # merge in consortium info
            final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
            return final_qualify_json or {"资格评审": ""}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test for the qualification pipeline.
    # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'}
    qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证',
                            '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'}
    truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf"
    knowledge_name = "招标解析word13"
    res = process_qualification(qualification_review, truncate3, knowledge_name)
    print(json.dumps(res, ensure_ascii=False, indent=4))

# Example follow-up prompt (Chinese, kept verbatim):
# 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。
|
@ -1,29 +0,0 @@
|
||||
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page
|
||||
|
||||
def check_strings_in_pdf(file_path, judge_list=None):
    """Build follow-up questions for each keyword found in a PDF.

    Generalized: *judge_list* is now an optional parameter (defaulting to the
    previously hard-coded keywords), so callers can check arbitrary terms —
    backward compatible with the old one-argument call.

    Args:
        file_path: Path of the PDF to scan.
        judge_list: Keywords to look for; defaults to
            ['施工机械设备', '企业信息登记'].

    Returns:
        A list of JSON-request prompts, one per keyword present in the PDF,
        or None when no keyword is found.
    """
    if judge_list is None:
        judge_list = ['施工机械设备', '企业信息登记']

    # Read text from PDF and collapse whitespace so substring checks are robust.
    text = extract_text_by_page(file_path)
    full_text = ''.join(text).replace('\n', '').replace(' ', '')

    # One question per keyword actually present, in judge_list order
    # (replaces the old per-index checks, which only supported two entries).
    ques_list = [
        f"该招标文件对于'{term}'的要求是怎样的,请按json格式给我提供信息,键名为'{term}',若存在未知信息,在对应的键值中填'未知'。"
        for term in judge_list if term in full_text
    ]

    if not ques_list:
        return None
    return ques_list
|
||||
|
||||
# Test cases or example usage
|
||||
# Test cases or example usage
if __name__ == '__main__':
    file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf'  # Replace with your actual PDF file path

    # BUG FIX: check_strings_in_pdf was called with a second positional
    # argument it did not accept (TypeError), and its possibly-None result
    # was iterated directly (TypeError on no match). Call it with the file
    # path only and guard against None.
    questions = check_strings_in_pdf(file_path)
    for question in questions or []:
        print(question)
|
@ -1,48 +0,0 @@
|
||||
import json
|
||||
import random
|
||||
from http import HTTPStatus
|
||||
from dashscope import Generation
|
||||
|
||||
def call_with_messages(messages):
    """Send *messages* to qwen-max and return the first choice's content.

    Raises:
        Exception: when DashScope returns a non-OK status, with the request
            id and error details in the message.
    """
    response = Generation.call(
        model="qwen-max",
        messages=messages,
        seed=random.randint(1, 10000),
        temperature=0.5,
        top_p=0.5,
        top_k=50,
        result_format='message',
    )
    if response.status_code != HTTPStatus.OK:
        raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}')
    return response.output['choices'][0]['message']['content']
|
||||
|
||||
def prepare_question_from_json(json_path, user_query):
    """Serialize the JSON file at *json_path* (non-ASCII preserved) and
    append *user_query* to form a single prompt string."""
    with open(json_path, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)
    return json.dumps(payload, ensure_ascii=False) + user_query
|
||||
|
||||
#专用于判断是否
|
||||
def qianwen_ask(json_path, user_query):
|
||||
messages = []
|
||||
question = prepare_question_from_json(json_path, user_query)
|
||||
messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
|
||||
messages.append({'role': 'user', 'content': question})
|
||||
return call_with_messages(messages)
|
||||
|
||||
#通用问题
|
||||
def qianwen_ask2(questions):
|
||||
messages = []
|
||||
messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
|
||||
messages.append({'role': 'user', 'content': questions})
|
||||
return call_with_messages(messages)
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: ask the yes/no battery over a local judgment file.
    json_path = 'judge_exist.json'
    prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。"
    try:
        print(qianwen_ask(json_path, prompt))
    except Exception as e:
        print(f"An error occurred: {e}")
|
@ -9,7 +9,7 @@ from flask_app.routes.get_deviation import get_deviation_bp
|
||||
from flask_app.routes.little_zbparse import little_zbparse_bp
|
||||
from flask_app.routes.upload import upload_bp
|
||||
from flask_app.routes.test_zbparse import test_zbparse_bp
|
||||
|
||||
from flask_app.general.清除file_id import delete_file_by_ids,read_file_ids
|
||||
class FlaskAppWithLimiter(Flask):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@ -23,25 +23,6 @@ def create_app():
|
||||
handler.setFormatter(CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
||||
app.logger.addHandler(handler)
|
||||
app.logger.setLevel(logging.INFO)
|
||||
# @app.before_request
|
||||
# def before_request():
|
||||
# """
|
||||
# 每个请求开始前初始化 logger 和 output_folder,
|
||||
# 根据请求的端点选择不同的子文件夹。
|
||||
# """
|
||||
# # 确定当前请求的端点
|
||||
# blueprint = request.blueprint
|
||||
# # 映射端点到子文件夹
|
||||
# subfolder_map = {
|
||||
# 'get_deviation': 'output3',
|
||||
# 'little_zbparse': 'output2',
|
||||
# 'upload': 'output1',
|
||||
# 'test_zbparse': 'test_output'
|
||||
# }
|
||||
# # 获取对应的子文件夹,默认为 'output1'
|
||||
# subfolder = subfolder_map.get(blueprint, 'output1')
|
||||
# # 创建 logger 和 output_folder
|
||||
# create_logger(app, subfolder)
|
||||
|
||||
# 注册蓝图
|
||||
app.register_blueprint(get_deviation_bp)
|
||||
@ -58,6 +39,13 @@ def create_app():
|
||||
monitor = getattr(g, 'monitor', None)
|
||||
if limiter and not monitor.is_timeout:
|
||||
limiter.semaphore.release()
|
||||
output_folder = getattr(g, 'output_folder', None)
|
||||
if output_folder:
|
||||
# 执行与output_folder相关的清理操作(例如删除临时文件)
|
||||
logger = g.logger
|
||||
logger.info(f"正在清理输出文件夹: {output_folder}")
|
||||
file_ids=read_file_ids(output_folder)
|
||||
delete_file_by_ids(file_ids)
|
||||
return app
|
||||
|
||||
#TODO:培训要求、总体要求、进度要求、'建设要求'到技术要求中,归类到其他要求中
|
||||
|
@ -46,7 +46,6 @@ def find_exists(truncate_file, required_keys):
|
||||
start_match = re.search(begin_pattern, first_page_clean)
|
||||
if not start_match:
|
||||
print("未找到开始模式,返回完整第一页")
|
||||
start_index = 0
|
||||
first_content = first_page_clean
|
||||
else:
|
||||
start_index = start_match.end()
|
||||
@ -186,7 +185,11 @@ def generate_template(required_keys, type=1):
|
||||
"""
|
||||
)
|
||||
else:
|
||||
specific_instructions = "4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若内容全在表格中,键值为空列表[]。"
|
||||
specific_instructions = textwrap.dedent(
|
||||
"""4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若技术要求的内容全在表格中,键值为空列表[]。
|
||||
5. 在提取'技术要求'时,注意不要提取有关'售后、维护、运维、培训、质保'等要求,它们不属于'技术要求'。
|
||||
"""
|
||||
)
|
||||
return textwrap.dedent(
|
||||
f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。请不要提取{another_keys_str}中的内容。
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user