zbparse/flask_app/main/文件分类普通版.py
2024-08-29 16:37:09 +08:00

351 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter
def validate_pdf(file_path):
""" 验证PDF文件是否损坏 """
try:
with open(file_path, "rb") as file:
pdf = PdfReader(file)
return len(pdf.pages) > 0
except Exception as e:
print(f"Error reading PDF {file_path}: {str(e)}")
return False
def truncate_pdf(source_path, target_path, max_pages=15):
"""截取PDF文件的前15页并保存"""
try:
with open(source_path, "rb") as infile:
reader = PdfReader(infile)
writer = PdfWriter()
for i in range(min(max_pages, len(reader.pages))):
writer.add_page(reader.pages[i])
with open(target_path, "wb") as outfile:
writer.write(outfile)
except Exception as e:
print(f"Error processing PDF {source_path}: {str(e)}")
def copy_file(src, dest):
""" 复制单个文件 """
os.makedirs(os.path.dirname(dest), exist_ok=True)
shutil.copy2(src, dest)
def copy_directory(source, destination):
"""复制整个目录"""
os.makedirs(destination, exist_ok=True)
for item in os.listdir(source):
src_path = os.path.join(source, item)
dest_path = os.path.join(destination, item)
if os.path.isfile(src_path):
copy_file(src_path, dest_path)
else:
copy_directory(src_path, dest_path)
def unique_file_name(base_path, file_name):
counter = 1
name_part, extension = os.path.splitext(file_name)
new_file_name = file_name
# 检查文件是否存在,若存在则修改文件名
while os.path.exists(os.path.join(base_path, new_file_name)):
new_file_name = f"{name_part}_{counter}{extension}"
counter += 1
return new_file_name
def process_pdf_folders(source_dir, target_dir):
""" 处理源目录中的PDF文件并基于条件选择目标文件夹 """
for root, dirs, files in os.walk(source_dir, topdown=False):
for file in files:
if file.lower().endswith('.pdf'):
source_file_path = os.path.join(root, file)
if validate_pdf(source_file_path):
relative_path = os.path.relpath(root, source_dir)
target_file_dir = os.path.join(target_dir, relative_path)
target_file_path = os.path.join(target_file_dir, file)
copy_file(source_file_path, target_file_path)
else:
print(f"Deleted corrupt file: {source_file_path}")
# 清除空目录
if not os.listdir(root):
os.rmdir(root)
def classify_folders(source_dir, target_dir, project_index):
"""Classifies folders and processes files based on specific criteria."""
temp_dir = os.path.join(source_dir, 'temp')
os.makedirs(temp_dir, exist_ok=True)
target_project_dir = None
processed_dirs = set() # Set to track processed directories
for subdir in os.listdir(source_dir):
subdir_path = os.path.join(source_dir, subdir)
if not os.path.isdir(subdir_path):
continue
files = [f.lower() for f in os.listdir(subdir_path)]
if 'zb.pdf' in files:
target_project_dir, project_index ,target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index)
processed_dirs.add(subdir_path) # Mark this directory as processed
elif subdir.lower() == "输出文件":
process_evaluation_files(subdir_path, target_project_dir)
processed_dirs.add(subdir_path) # Mark this directory as processed
# Process remaining folders, skipping already processed ones
process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name)
#删除tmp目录
# if os.path.exists(temp_dir):
# shutil.rmtree(temp_dir)
if os.path.exists(source_dir):
shutil.rmtree(source_dir)
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
"""Processes tender files and returns the target project directory and updated index."""
zb_path = os.path.join(subdir_path, "zb.pdf")
truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages
bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1)
project_index += 1
zb_response = extract_answer(bot_response[1], 1)
zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠
target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]
new_zb_path = os.path.join(subdir_path, target_zb_name)
os.rename(zb_path, new_zb_path)
target_category_dir = os.path.join(target_dir, category)
target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))
os.remove(truncated_zb_path)
shutil.rmtree(subdir_path)
return target_project_dir, project_index,zb_response[2]
def process_evaluation_files(subdir_path, target_project_dir):
"""Processes evaluation folders."""
copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
shutil.rmtree(subdir_path)
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
"""Processes remaining folders containing bid files."""
target_tb_dir = os.path.join(target_project_dir, "投标文件夹")
for subdir in os.listdir(source_dir):
subdir_path = os.path.join(source_dir, subdir)
if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
continue # Skip processed directories
target_tbcom_dir = None # Initialize outside the file loop
for item in os.listdir(subdir_path):
item_src_path = os.path.join(subdir_path, item)
new_name = "truncate_" + item
truncated_tb_path = os.path.join(temp_dir, new_name)
truncate_pdf(item_src_path, truncated_tb_path)
bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2)
project_index += 1
tb_response = extract_answer(bot_response[1], 2)
if not tb_response:
continue # If there's no response, skip this file
# Initialize target_tbcom_dir only once based on the first processed file
if not target_tbcom_dir:
target_tb_name, _ = tb_response
if(target_tb_name != target_zb_name and target_tb_name != "未知"):
target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
print(target_tbcom_dir)
os.makedirs(target_tbcom_dir, exist_ok=True)
tb_section = tb_response[1] + ".pdf"
new_tb_path = os.path.join(subdir_path, tb_section)
# 获取唯一文件名
new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
os.rename(item_src_path, new_tb_path)
# Remove temporary truncated file
os.remove(truncated_tb_path)
# Copy the whole directory at once after all files have been processed and renamed
if target_tbcom_dir:
copy_directory(subdir_path, target_tbcom_dir)
shutil.rmtree(subdir_path) # Optionally remove the original directory after copying
import re
import re
def extract_answer(input_string, type):
# 使用正则表达式匹配所需的信息
# 第一种模式:项目名称、项目编号、招标人、类别
# 在每个字段值后都添加了\s*以忽略尾随的空格
pattern1 = r"项目名称[:]\s*(.*?)[;]\s*项目编号[:]\s*(.*?)[;]\s*招标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
# 第二种模式:投标人、类别
pattern2 = r"投标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
if type == 1:
match = re.search(pattern1, input_string)
if match:
print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}")
return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()]
else:
print("No match found for type 1.")
return []
elif type == 2:
match = re.search(pattern2, input_string)
if match:
# 检查是否包含“投标函”,如果是则替换部分内容为“商务文件”
part = match.group(2).strip()
if "投标函" in part:
part = "商务文件"
return [match.group(1).strip(), part]
else:
print("No match found for type 2.")
return []
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads
def send_message(assistant, index,documents,message='百炼是什么?'):
print(f"Query: {message}")
# create thread.
# create a thread.
thread = Threads.create()
print(thread)
# create a message.
message = Messages.create(thread.id, content=message)
# create run
run = Runs.create(thread.id, assistant_id=assistant.id)
print("run:" + str(run))
# # get run statue
run_status = Runs.get(run.id, thread_id=thread.id)
# print(run_status)
# wait for run completed or requires_action
run_status = Runs.wait(run.id, thread_id=thread.id)
# print(run_status)
# if prompt input tool result, submit tool result.
run_status = Runs.get(run.id, thread_id=thread.id)
# print(run_status)
# verify_status_code(run_status)
# get the thread messages.
msgs = Messages.list(thread.id)
# print(msgs)
# print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))
ans = []
print("运行结果:")
for message in msgs['data'][::-1]:
ans.append(message['content'][0]['text']['value'])
print("content: ", message['content'][0]['text']['value'])
print("\n")
deleteFileFromKnowledge(index,documents)
return ans
def file_parse(filepath, knowledge_index, type):
parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
documents = parse.load_data(file_path=filepath)
index = DashScopeCloudIndex("文件分类临时知识库")
index._insert(documents)
retriever = index.as_retriever()
pipeline_id = str(retriever.pipeline_id)
assistant = Assistants.create(
model='qwen-max',
name='smart helper',
description='智能助手,支持知识库查询和插件调用。',
instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
tools=[
{
"type": "code_interpreter"
},
{
"type": "rag",
"prompt_ra": {
"pipeline_id": pipeline_id,
"parameters": {
"type": "object",
"properties": {
"query_word": {
"type": "str",
"value": "${document1}"
}
}
}
}
}]
)
questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\
请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别XXX"
questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX"
if (type == 1):
questions = questions1
elif (type == 2):
questions = questions2
return send_message(assistant,index,documents, message=questions)
def deleteFileFromKnowledge(index,documents):
# 初始化一个列表来存储所有文档的 ID
file_ids = []
# 检查documents是否为列表且不为空
if isinstance(documents, list) and documents:
# 遍历每个文档
for document in documents:
# 使用属性访问方式获取每个文档的 id_
# 确保 document 对象有一个名为 id_ 的属性
file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常
if file_id:
file_ids.append(file_id) # 将 id 添加到列表中
index.delete_ref_doc(file_ids)
def process_directory(base_directory, intermediate_directory, final_directory):
# 遍历base_directory下的所有子目录
for subdir in os.listdir(base_directory):
subdir_path = os.path.join(base_directory, subdir)
# 确保是目录
if os.path.isdir(subdir_path):
# 处理每个项目的目录
process_pdf_folders(subdir_path, intermediate_directory)
classify_folders(intermediate_directory, final_directory, 0)
# 清理临时目录以备下一个项目使用
if os.path.exists(intermediate_directory):
shutil.rmtree(intermediate_directory)
def main(base_directory, intermediate_directory, final_directory):
# 确保中间目录和最终目录存在
os.makedirs(intermediate_directory, exist_ok=True)
os.makedirs(final_directory, exist_ok=True)
process_directory(base_directory, intermediate_directory, final_directory)
#TODO后期可加多线程进行处理
if __name__ == "__main__":
base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
intermediate_directory = 'D:\\tmp'
final_directory = 'D:\\output'
main(base_directory, intermediate_directory, final_directory) #处理多级目录
# process_pdf_folders(base_directory, intermediate_directory) ##处理单级目录
# classify_folders(intermediate_directory, final_directory, 0)