zbparse/flask_app/main/多线程分类.py

374 lines
16 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter
def validate_pdf(file_path):
""" 验证PDF文件是否损坏 """
try:
with open(file_path, "rb") as file:
pdf = PdfReader(file)
return len(pdf.pages) > 0
except Exception as e:
print(f"Error reading PDF {file_path}: {str(e)}")
return False
def truncate_pdf(source_path, target_path, max_pages=15):
"""截取PDF文件的前15页并保存"""
try:
with open(source_path, "rb") as infile:
reader = PdfReader(infile)
writer = PdfWriter()
for i in range(min(max_pages, len(reader.pages))):
writer.add_page(reader.pages[i])
with open(target_path, "wb") as outfile:
writer.write(outfile)
except Exception as e:
print(f"Error processing PDF {source_path}: {str(e)}")
def copy_file(src, dest):
""" 复制单个文件 """
os.makedirs(os.path.dirname(dest), exist_ok=True)
shutil.copy2(src, dest)
def copy_directory(source, destination):
"""复制整个目录"""
os.makedirs(destination, exist_ok=True)
for item in os.listdir(source):
src_path = os.path.join(source, item)
dest_path = os.path.join(destination, item)
if os.path.isfile(src_path):
copy_file(src_path, dest_path)
else:
copy_directory(src_path, dest_path)
def unique_file_name(base_path, file_name):
counter = 1
name_part, extension = os.path.splitext(file_name)
new_file_name = file_name
# 检查文件是否存在,若存在则修改文件名
while os.path.exists(os.path.join(base_path, new_file_name)):
new_file_name = f"{name_part}_{counter}{extension}"
counter += 1
return new_file_name
def process_pdf_folders(source_dir, target_dir):
""" 处理源目录中的PDF文件并基于条件选择目标文件夹 """
for root, dirs, files in os.walk(source_dir, topdown=False):
for file in files:
if file.lower().endswith('.pdf'):
source_file_path = os.path.join(root, file)
if validate_pdf(source_file_path):
relative_path = os.path.relpath(root, source_dir)
target_file_dir = os.path.join(target_dir, relative_path)
target_file_path = os.path.join(target_file_dir, file)
copy_file(source_file_path, target_file_path)
else:
print(f"Deleted corrupt file: {source_file_path}")
# 清除空目录
if not os.listdir(root):
os.rmdir(root)
def classify_folders(source_dir, target_dir, project_index):
"""Classifies folders and processes files based on specific criteria."""
temp_dir = os.path.join(source_dir, 'temp')
os.makedirs(temp_dir, exist_ok=True)
target_project_dir = None
processed_dirs = set() # Set to track processed directories
for subdir in os.listdir(source_dir):
subdir_path = os.path.join(source_dir, subdir)
if not os.path.isdir(subdir_path):
continue
files = [f.lower() for f in os.listdir(subdir_path)]
if 'zb.pdf' in files:
target_project_dir, target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index)
processed_dirs.add(subdir_path) # Mark this directory as processed
elif subdir.lower() == "输出文件":
process_evaluation_files(subdir_path, target_project_dir)
processed_dirs.add(subdir_path) # Mark this directory as processed
# Process remaining folders, skipping already processed ones
process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name)
#删除tmp目录
# if os.path.exists(temp_dir):
# shutil.rmtree(temp_dir)
if os.path.exists(source_dir):
shutil.rmtree(source_dir)
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
"""Processes tender files and returns the target project directory and updated index."""
zb_path = os.path.join(subdir_path, "zb.pdf")
truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages
bot_response = file_parse(truncated_zb_path, project_index, 1)
zb_response = extract_answer(bot_response[1], 1)
zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠
target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]
new_zb_path = os.path.join(subdir_path, target_zb_name)
os.rename(zb_path, new_zb_path)
target_category_dir = os.path.join(target_dir, category)
target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))
os.remove(truncated_zb_path)
shutil.rmtree(subdir_path)
return target_project_dir, zb_response[2]
def process_evaluation_files(subdir_path, target_project_dir):
"""Processes evaluation folders."""
copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
shutil.rmtree(subdir_path)
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
"""Processes remaining folders containing bid files."""
target_tb_dir = os.path.join(target_project_dir, "投标文件夹")
for subdir in os.listdir(source_dir):
subdir_path = os.path.join(source_dir, subdir)
if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
continue # Skip processed directories
target_tbcom_dir = None # Initialize outside the file loop
for item in os.listdir(subdir_path):
item_src_path = os.path.join(subdir_path, item)
new_name = "truncate_" + item
truncated_tb_path = os.path.join(temp_dir, new_name)
truncate_pdf(item_src_path, truncated_tb_path)
bot_response = file_parse(truncated_tb_path, project_index, 2)
tb_response = extract_answer(bot_response[1], 2)
if not tb_response:
continue # If there's no response, skip this file
# Initialize target_tbcom_dir only once based on the first processed file
if not target_tbcom_dir:
target_tb_name, _ = tb_response
if(target_tb_name != target_zb_name and target_tb_name != "未知"):
target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
print(target_tbcom_dir)
os.makedirs(target_tbcom_dir, exist_ok=True)
tb_section = tb_response[1] + ".pdf"
new_tb_path = os.path.join(subdir_path, tb_section)
# 获取唯一文件名
new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
os.rename(item_src_path, new_tb_path)
# Remove temporary truncated file
os.remove(truncated_tb_path)
# Copy the whole directory at once after all files have been processed and renamed
if target_tbcom_dir:
copy_directory(subdir_path, target_tbcom_dir)
shutil.rmtree(subdir_path) # Optionally remove the original directory after copying
import re
def extract_answer(input_string, type):
# 使用正则表达式匹配所需的信息
# 第一种模式:项目名称、项目编号、招标人、类别
# 在每个字段值后都添加了\s*以忽略尾随的空格
pattern1 = r"项目名称[:]\s*(.*?)[;]\s*项目编号[:]\s*(.*?)[;]\s*招标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
# 第二种模式:投标人、类别
pattern2 = r"投标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
if type == 1:
match = re.search(pattern1, input_string)
if match:
print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}")
return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()]
else:
print("No match found for type 1.")
return []
elif type == 2:
match = re.search(pattern2, input_string)
if match:
# 检查是否包含“投标函”,如果是则替换部分内容为“商务文件”
part = match.group(2).strip()
if "投标函" in part:
part = "商务文件"
return [match.group(1).strip(), part]
else:
print("No match found for type 2.")
return []
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads
def send_message(assistant, index,documents,message='百炼是什么?'):
print(f"Query: {message}")
# create thread.
# create a thread.
thread = Threads.create()
print(thread)
# create a message.
message = Messages.create(thread.id, content=message)
# create run
run = Runs.create(thread.id, assistant_id=assistant.id)
print("run:" + str(run))
# # get run statue
run_status = Runs.get(run.id, thread_id=thread.id)
# print(run_status)
# wait for run completed or requires_action
run_status = Runs.wait(run.id, thread_id=thread.id)
# print(run_status)
# if prompt input tool result, submit tool result.
run_status = Runs.get(run.id, thread_id=thread.id)
# print(run_status)
# verify_status_code(run_status)
# get the thread messages.
msgs = Messages.list(thread.id)
# print(msgs)
# print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))
ans = []
print("运行结果:")
for message in msgs['data'][::-1]:
ans.append(message['content'][0]['text']['value'])
print("content: ", message['content'][0]['text']['value'])
print("\n")
deleteFileFromKnowledge(index,documents)
return ans
def file_parse(filepath, knowledge_index, type):
parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
documents = parse.load_data(file_path=filepath)
# 创建一个字典来映射index值到知识库名
index_to_name = {
0: "文件分类知识库0",
1: "文件分类知识库1",
2: "文件分类知识库2",
3: "文件分类知识库3"
}
# 使用get方法获取对应的知识库名如果index不存在返回"默认文件分类知识库"
cloud_index_name = index_to_name.get(knowledge_index, "0")
index = DashScopeCloudIndex(cloud_index_name)
index._insert(documents)
retriever = index.as_retriever()
pipeline_id = str(retriever.pipeline_id)
assistant = Assistants.create(
model='qwen-plus',
name='smart helper',
description='智能助手,支持知识库查询和插件调用。',
instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
tools=[
{
"type": "code_interpreter"
},
{
"type": "rag",
"prompt_ra": {
"pipeline_id": pipeline_id,
"parameters": {
"type": "object",
"properties": {
"query_word": {
"type": "str",
"value": "${document1}"
}
}
}
}
}]
)
questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方包括原材料产品设备电能和固态液态气态物体等服务类招标又称为服务采购指的是除了工程和货之外的其他招标投标活动服物招标投标范围包含建设工程的勘察设计监理工程咨询评估科技项目科研课题物业管理金融保险服务等\
请按下列键值对格式给我提供信息,避免无关内容:项目名称:XXX;项目编号:XXX;招标人:XXX;类别XXX"
questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX"
if (type == 1):
questions = questions1
elif (type == 2):
questions = questions2
return send_message(assistant,index,documents, message=questions)
def deleteFileFromKnowledge(index,documents):
# 初始化一个列表来存储所有文档的 ID
file_ids = []
# 检查documents是否为列表且不为空
if isinstance(documents, list) and documents:
# 遍历每个文档
for document in documents:
# 使用属性访问方式获取每个文档的 id_
# 确保 document 对象有一个名为 id_ 的属性
file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常
if file_id:
file_ids.append(file_id) # 将 id 添加到列表中
index.delete_ref_doc(file_ids)
import concurrent.futures
import logging
# 配置日志,只输出到控制台
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index):
logging.info(f"Starting to process directory: {subdir_path} with index {project_index}")
# 为每个子目录创建一个专用的临时目录
intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}")
os.makedirs(intermediate_directory, exist_ok=True)
logging.info(f"Created intermediate directory: {intermediate_directory}")
# 这里可以添加具体的目录处理函数,例如:
process_pdf_folders(subdir_path, intermediate_directory)
classify_folders(intermediate_directory, final_directory, project_index % 4)
# 处理完毕后清理该目录
shutil.rmtree(intermediate_directory)
logging.info(f"Deleted intermediate directory: {intermediate_directory}")
def main(base_directory, base_intermediate_directory, final_directory):
os.makedirs(final_directory, exist_ok=True)
logging.info(f"Final directory ensured at: {final_directory}")
project_index = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for subdir in os.listdir(base_directory):
subdir_path = os.path.join(base_directory, subdir)
if os.path.isdir(subdir_path):
logging.info(f"Submitting job for directory: {subdir_path}")
future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index)
futures.append(future)
project_index += 1
for future in concurrent.futures.as_completed(futures):
try:
future.result() # 如果执行没有抛出异常,完成该任务
except Exception as e:
logging.error(f"Thread resulted in an error: {e}")
if __name__ == "__main__":
base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
base_intermediate_directory = 'D:\\tmp'
final_directory = 'D:\\output'
main(base_directory, base_intermediate_directory, final_directory)