12.19 修复豆包模型使用的bug
This commit is contained in:
commit
fd45d78c10
@ -1,5 +1,9 @@
|
||||
import re
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from flask_app.general.format_change import docx2pdf
|
||||
|
||||
|
||||
def extract_common_header(pdf_path):
|
||||
|
||||
def get_headers(pdf_document, start_page, pages_to_read):
|
||||
@ -105,7 +109,19 @@ def is_scanned_pdf(file_path, max_pages=15):
|
||||
return False # 不是扫描型
|
||||
return True # 前 max_pages 页都没有文本
|
||||
|
||||
|
||||
def get_pdf_page_count(file_path):
    """Return the number of pages in the document at ``file_path``.

    A path ending in ``.doc`` or ``.docx`` (case-insensitive) is first handed
    to ``docx2pdf`` and the path it returns is read instead; any other path is
    read as-is with ``PdfReader``.

    Any exception raised along the way is printed and reported as a page
    count of 0, so callers never see an error from this function.
    """
    try:
        is_word_file = file_path.lower().endswith(('.doc', '.docx'))
        # Word documents are converted first; everything else is read directly.
        target_pdf = docx2pdf(file_path) if is_word_file else file_path
        return len(PdfReader(target_pdf).pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0
|
||||
if __name__ == '__main__':
|
||||
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||
res=is_scanned_pdf(file_path)
|
||||
|
@ -1,7 +1,7 @@
|
||||
import concurrent.futures
|
||||
import json
|
||||
import time
|
||||
|
||||
from flask_app.general.clean_pdf import get_pdf_page_count
|
||||
from flask_app.general.doubao import pdf2txt
|
||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
@ -28,12 +28,24 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
|
||||
return DEFAULT_PROCUREMENT_REQS.copy()
|
||||
|
||||
try:
|
||||
processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式
|
||||
if procurement_path == invalid_path:
|
||||
# 读取 PDF 页码数
|
||||
page_count = get_pdf_page_count(procurement_path)
|
||||
|
||||
if page_count > 80: # 如果页码数大于 80
|
||||
model_type = 0
|
||||
processed_filepath = ""
|
||||
else:
|
||||
model_type = 1
|
||||
processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式
|
||||
else:
|
||||
model_type = 1
|
||||
processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式
|
||||
# processed_filepath = pdf2txt(procurement_path) # 纯文本提取
|
||||
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
# 提交任务给线程池
|
||||
future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath)
|
||||
future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
|
||||
time.sleep(0.5) # 保持原有的延时
|
||||
future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
|
||||
# 获取并行任务的结果
|
||||
|
Loading…
x
Reference in New Issue
Block a user