2025-01-10 14:30:35 +08:00
|
|
|
|
import time
|
|
|
|
|
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
2025-02-18 14:21:34 +08:00
|
|
|
|
from docx import Document
|
2025-02-19 09:41:38 +08:00
|
|
|
|
import fitz
|
2025-02-12 14:55:35 +08:00
|
|
|
|
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
|
2025-01-10 14:30:35 +08:00
|
|
|
|
|
|
|
|
|
# PDFs with this many pages or fewer are rejected without calling the model.
_MAX_NON_TENDER_PDF_PAGES = 5
# DOCX files whose paragraph text is shorter than this are rejected outright.
_MIN_DOCX_CHARS = 1000


def _count_pdf_pages(file_path):
    """Return the page count of a PDF, or None when neither parser can read it.

    PyPDF2 is tried first; if it raises for any reason, PyMuPDF (fitz) is used
    as a fallback.
    """
    try:
        with open(file_path, 'rb') as f:
            return len(PdfReader(f).pages)
    except Exception:
        pass  # deliberate best-effort: fall through to the PyMuPDF attempt
    try:
        doc = fitz.open(file_path)
        try:
            return len(doc)
        finally:
            # Release the PyMuPDF handle; it was previously left open.
            doc.close()
    except Exception:
        return None


def _docx_has_min_text(file_path, min_chars):
    """Return True if the DOCX paragraphs contain at least min_chars characters.

    Only the running length is tracked (no string is built), and scanning
    stops at the first paragraph that reaches the threshold.
    """
    total_chars = 0
    for para in Document(file_path).paragraphs:
        total_chars += len(para.text)
        if total_chars >= min_chars:
            return True
    return False


def judge_zbfile_exec(file_path):
    """Decide whether a file is a tender (bidding) document.

    Cheap local pre-filters run first:
      * ``.pdf`` files that cannot be parsed, or that have 5 pages or fewer,
        are rejected.
      * ``.docx`` files that cannot be parsed, or whose paragraphs hold fewer
        than 1000 characters, are rejected.
    Any other file (and any file passing the pre-filter) is handed to
    ``upload_file`` and the resulting id is queried through ``qianwen_long``
    with a fixed yes/no prompt.

    Args:
        file_path: Path of the file to classify. The extension check is
            case-insensitive.

    Returns:
        bool: True when the model reply does not contain the character '否'
        ("no"); False when a pre-filter rejects the file or when any
        exception occurs during processing.
    """
    try:
        start_time = time.time()
        lowered_path = file_path.lower()

        if lowered_path.endswith('.pdf'):
            num_pages = _count_pdf_pages(file_path)
            if num_pages is None:
                print("PDF 文件读取失败")
                return False  # both parsers failed
            if num_pages <= _MAX_NON_TENDER_PDF_PAGES:
                return False  # too short to be a tender document
        elif lowered_path.endswith('.docx'):
            try:
                if not _docx_has_min_text(file_path, _MIN_DOCX_CHARS):
                    return False  # not enough text to be a tender document
            except Exception:
                print("DOCX 文件读取失败,可能为乱码文件")
                return False  # unreadable DOCX

        # Ask the large language model for the final verdict.
        user_query = (
            "该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。\n"
            "以下是常见的招标文件类型:\n"
            "公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。\n"
            "若有未涵盖的类型,但其内容明确表达了项目需求、采购或招标信息,且包含指导投标人参与的关键要素,则可视为招标文件。\n"
            "排除情况:\n"
            "1. 请注意区分招标文件和投标文件,若文件仅有投标文件格式要求部分,或是投标、响应性文件,则不视为招标文件。\n"
            "2. 若文件内容为乱码,无有效信息,请直接返回'否'。\n"
            "请基于上述内容判断文件是否属于招标文件。\n"
        )
        file_id = upload_file(file_path)
        model_res = qianwen_long(file_id, user_query)
        end_time = time.time()
        print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
        print(f"判断是否属于招标文件:{model_res}")

        # Anything other than an explicit "no" counts as a tender document.
        return '否' not in model_res

    except Exception as e:
        print(f"处理文件时出错: {e}")
        return False
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke run: classify one local sample file, print the verdict,
    # then report the total wall-clock time of the run.
    t0 = time.time()
    sample_path = r"C:\Users\Administrator\Desktop\fsdownload\19f53a17-ad4c-43b5-a7ed-981958ec3e0fs\ztbfile.docx"
    print("yes" if judge_zbfile_exec(sample_path) else "no")
    print(f"整个程序实际耗时:{time.time() - t0:.2f} 秒")
|