2.18 空的docx无法被识别为非招标文件

2025-02-18 14:21:34 +08:00 · 2025-02-18 14:21:34 +08:00 · 518f5d7ac7
commit 518f5d7ac7
parent bd8addd788
1 changed file with 19 additions and 3 deletions
--- a/flask_app/routes/判断是否是招标文件.py
+++ b/flask_app/routes/判断是否是招标文件.py
@@ -1,9 +1,9 @@
 import time
 from PyPDF2 import PdfReader  # 确保已安装 PyPDF2: pip install PyPDF2
+from docx import Document

 from flask_app.general.llm.通义千问long import upload_file, qianwen_long

-
 def judge_zbfile_exec(file_path):
    """
    判断文件是否属于招标文件，并返回结果。
@@ -18,7 +18,23 @@ def judge_zbfile_exec(file_path):
                num_pages = len(reader.pages)
            if num_pages <= 5:
                return False
-        # 模拟使用大模型进行判断
+        elif file_path.lower().endswith('.docx'):
+            doc = Document(file_path)
+            accumulated_text = ""
+            chunk_size = 10  # 每次读取10个段落
+            paragraphs = doc.paragraphs
+
+            for i in range(0, len(paragraphs), chunk_size):
+                chunk = paragraphs[i:i + chunk_size]
+                for para in chunk:
+                    accumulated_text += para.text
+                # 判断累计字符数是否已经达到1000字,
+                if len(accumulated_text) >= 1000:
+                    break
+            # 若累计内容不足1000字，则直接返回False
+            if len(accumulated_text) < 1000:
+                return False
+        # 使用大模型进行判断
        user_query = """该文件是否属于招标文件？如果是的话，请返回'是',如果不是的话，返回'否'。请不要返回其他解释或内容。
        以下是常见的招标文件类型：
            公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。
@@ -42,7 +58,7 @@ def judge_zbfile_exec(file_path):

 if __name__ == '__main__':
    start_time = time.time()
-    pdf_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\乱码文件测试.doc"
+    pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx"
    res = judge_zbfile_exec(pdf_path)
    if res:
        print("yes")