From 518f5d7ac7441c7b383950dac5c539915a5bd418 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 18 Feb 2025 14:21:34 +0800 Subject: [PATCH] =?UTF-8?q?2.18=20=E7=A9=BA=E7=9A=84docx=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E8=A2=AB=E8=AF=86=E5=88=AB=E4=B8=BA=E9=9D=9E=E6=8B=9B=E6=A0=87?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/routes/判断是否是招标文件.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/flask_app/routes/判断是否是招标文件.py b/flask_app/routes/判断是否是招标文件.py index 6fce979..94d7b83 100644 --- a/flask_app/routes/判断是否是招标文件.py +++ b/flask_app/routes/判断是否是招标文件.py @@ -1,9 +1,9 @@ import time from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2 +from docx import Document from flask_app.general.llm.通义千问long import upload_file, qianwen_long - def judge_zbfile_exec(file_path): """ 判断文件是否属于招标文件,并返回结果。 @@ -18,7 +18,23 @@ def judge_zbfile_exec(file_path): num_pages = len(reader.pages) if num_pages <= 5: return False - # 模拟使用大模型进行判断 + elif file_path.lower().endswith('.docx'): + doc = Document(file_path) + accumulated_text = "" + chunk_size = 10 # 每次读取10个段落 + paragraphs = doc.paragraphs + + for i in range(0, len(paragraphs), chunk_size): + chunk = paragraphs[i:i + chunk_size] + for para in chunk: + accumulated_text += para.text + # 判断累计字符数是否已经达到1000字, + if len(accumulated_text) >= 1000: + break + # 若累计内容不足1000字,则直接返回False + if len(accumulated_text) < 1000: + return False + # 使用大模型进行判断 user_query = """该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。 以下是常见的招标文件类型: 公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。 @@ -42,7 +58,7 @@ def judge_zbfile_exec(file_path): if __name__ == '__main__': start_time = time.time() - pdf_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\乱码文件测试.doc" + pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx" res = judge_zbfile_exec(pdf_path) if res: print("yes")