From f5261087b474cd5fee1ff3a2a2a134f80caca87c Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Thu, 19 Dec 2024 14:13:49 +0800
Subject: [PATCH] =?UTF-8?q?12.19=20=E4=BF=AE=E5=A4=8D=E8=B1=86=E5=8C=85?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E4=BD=BF=E7=94=A8=E7=9A=84bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/clean_pdf.py       | 18 +++++++++++++++++-
 flask_app/general/doubao.py          | 12 ++++++++----
 flask_app/货物标/技术参数要求提取.py | 25 +++++++++++--------------
 flask_app/货物标/提取采购需求main.py | 12 +++++++++---
 4 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py
index 0e40bf2..d55dc6d 100644
--- a/flask_app/general/clean_pdf.py
+++ b/flask_app/general/clean_pdf.py
@@ -1,5 +1,9 @@
 import re
 from PyPDF2 import PdfReader
+
+from flask_app.general.format_change import docx2pdf
+
+
 def extract_common_header(pdf_path):
 
     def get_headers(pdf_document, start_page, pages_to_read):
@@ -105,7 +109,19 @@ def is_scanned_pdf(file_path, max_pages=15):
                 return False  # 不是扫描型
     return True  # 前 max_pages 页都没有文本
 
-
+def get_pdf_page_count(file_path):
+    """Return the number of pages of the document at file_path, or 0 if any exception is raised.
+    Paths ending in .doc/.docx (case-insensitive) are first passed to docx2pdf and its result is read.
+    """
+    try:
+        pdf_path=file_path
+        if file_path.lower().endswith(('.doc', '.docx')):
+            pdf_path = docx2pdf(file_path)
+        reader = PdfReader(pdf_path)
+        return len(reader.pages)
+    except Exception as e:
+        print(f"读取 PDF 页码时出错：{e}")
+        return 0
 if __name__ == '__main__':
     file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
     res=is_scanned_pdf(file_path)
diff --git a/flask_app/general/doubao.py b/flask_app/general/doubao.py
index b0c6b22..6d27461 100644
--- a/flask_app/general/doubao.py
+++ b/flask_app/general/doubao.py
@@ -173,16 +173,20 @@ def read_txt_to_string(file_path):
     - file_path (str): txt文件的路径
 
     返回:
-    - str: 包含文件内容的字符串
+    - str: 包含文件内容的字符串，如果出错返回空字符串 ""
     """
+    if not file_path:  # 检查文件路径是否为空
+        return ""
+
     try:
         with open(file_path, 'r', encoding='utf-8') as file:  # 确保使用适当的编码
             content = file.read()  # 使用 read() 保持文件格式
         return content
-    except FileNotFoundError:
-        return "错误：文件未找到。"
+    except (FileNotFoundError, IOError):  # 捕获文件未找到或I/O错误
+        return ""
     except Exception as e:
-        return f"错误：读取文件时发生错误。详细信息：{e}"
+        # 可根据需要记录错误日志，但返回值仍是空字符串
+        return ""
 
 @sleep_and_retry
 @limits(calls=10, period=1)  # 每秒最多调用10次
diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py
index 9500c6c..84fc462 100644
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -359,27 +359,26 @@ def generate_prompt(judge_res, full_text=None):
     return base_prompt
 
 #文件内容以markdown格式组织，其中表格部分（若有）以html语法组织，
-def get_technical_requirements(invalid_path,processed_filepath):
+def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
+    judge_res=""
     file_id = ""
-    model_type = 1  # 默认使用豆包
-    first_query_template="""该文件是否说明了采购需求,即需要采购哪些内容（包括货物、设备、系统、功能模块等）?如果有,请回答'是',否则,回答'否'
-文件内容：
-{full_text}
-    """
-    judge_query = generate_full_user_query(processed_filepath, first_query_template)
-    # print(judge_query)
-    judge_res = doubao_model(judge_query)
-    if '否' in judge_res:
+    full_text = read_txt_to_string(processed_filepath)
+    if model_type:
+        first_query_template = """该文件是否说明了采购需求,即需要采购哪些内容（包括货物、设备、系统、功能模块等）?如果有,请回答'是',否则,回答'否'
+{}
+"""
+        judge_query = first_query_template.format(f"文件内容：{full_text}")
+        judge_res = doubao_model(judge_query)
+    if '否' in judge_res or model_type==0:
         model_type = 0  # 使用qianwen-long+invalid_path
         print("no!调用invalid_path")
         if invalid_path.lower().endswith('.pdf'):           #确保上传的是docx
             invalid_path = pdf2docx(invalid_path)
-        file_id=upload_file(invalid_path)
+        file_id = upload_file(invalid_path)
         user_query = generate_prompt(judge_res)
         model_res=qianwen_long(file_id,user_query)
         print(model_res)
     else:
-        full_text = read_txt_to_string(processed_filepath)
         user_query=generate_prompt(judge_res,full_text)
         model_res=doubao_model(user_query)
         print(model_res)
@@ -465,7 +464,6 @@ def get_technical_requirements(invalid_path,processed_filepath):
         modified_key = key.replace('.', '下的')
         # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
         if model_type:
-            full_text = read_txt_to_string(processed_filepath)
             new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容：{full_text}")   #转豆包后取消注释
         else:
             new_query = user_query_template.format(modified_key, key, modified_key,"")
@@ -477,7 +475,6 @@ def get_technical_requirements(invalid_path,processed_filepath):
             # 将键中的 '.' 替换为 '下的'
             modified_grouped_key = grouped_key.replace('.', '下的')
             if model_type:
-                full_text = read_txt_to_string(processed_filepath)
                 new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,
                                                            modified_grouped_key, f"文件内容：{full_text}")
             else:
diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py
index 4f0f4a3..a8f35ad 100644
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@@ -1,7 +1,7 @@
 import concurrent.futures
 import json
 import time
-
+from  flask_app.general.clean_pdf import get_pdf_page_count
 from flask_app.general.doubao import pdf2txt
 from flask_app.general.file2markdown import convert_file_to_markdown
 from flask_app.general.format_change import pdf2docx
@@ -27,13 +27,19 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
         return DEFAULT_PROCUREMENT_REQS.copy()
 
     try:
-        processed_filepath = convert_file_to_markdown(procurement_path)   # 转markdown格式
+        if procurement_path==invalid_path:
+            page_count = get_pdf_page_count(procurement_path)
+            model_type=0
+            processed_filepath=""
+        else:
+            model_type = 1
+            processed_filepath = convert_file_to_markdown(procurement_path)   # 转markdown格式
         # processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
         # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
         with concurrent.futures.ThreadPoolExecutor() as executor:
 
             # 提交任务给线程池
-            future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath)
+            future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
             time.sleep(0.5)  # 保持原有的延时
             future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
             # 获取并行任务的结果