diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index 0e40bf2..d55dc6d 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -1,5 +1,9 @@ import re from PyPDF2 import PdfReader + +from flask_app.general.format_change import docx2pdf + + def extract_common_header(pdf_path): def get_headers(pdf_document, start_page, pages_to_read): @@ -105,7 +109,19 @@ def is_scanned_pdf(file_path, max_pages=15): return False # 不是扫描型 return True # 前 max_pages 页都没有文本 - +def get_pdf_page_count(file_path): + """ + 获取 PDF 文件的页码数量 + """ + try: + pdf_path=file_path + if file_path.lower().endswith(('.doc', '.docx')): + pdf_path = docx2pdf(file_path) + reader = PdfReader(pdf_path) + return len(reader.pages) + except Exception as e: + print(f"读取 PDF 页码时出错:{e}") + return 0 if __name__ == '__main__': file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf" res=is_scanned_pdf(file_path) diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 536c5d3..3d60e50 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -1,7 +1,7 @@ import concurrent.futures import json import time - +from flask_app.general.clean_pdf import get_pdf_page_count from flask_app.general.doubao import pdf2txt from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx @@ -28,12 +28,24 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() try: - processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 + if procurement_path == invalid_path: + # 读取 PDF 页码数 + page_count = get_pdf_page_count(procurement_path) + + if page_count > 80: # 如果页码数大于 50 + model_type = 0 + processed_filepath = "" + else: + model_type = 1 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 + else: + model_type = 1 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 # processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: # 提交任务给线程池 - future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath) + future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type) time.sleep(0.5) # 保持原有的延时 future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath) # 获取并行任务的结果