12.19 修复豆包模型使用的bug
This commit is contained in:
parent
0da44d31a1
commit
f5261087b4
@ -1,5 +1,9 @@
|
|||||||
import re
|
import re
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
|
from flask_app.general.format_change import docx2pdf
|
||||||
|
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
|
|
||||||
def get_headers(pdf_document, start_page, pages_to_read):
|
def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
@ -105,7 +109,19 @@ def is_scanned_pdf(file_path, max_pages=15):
|
|||||||
return False # 不是扫描型
|
return False # 不是扫描型
|
||||||
return True # 前 max_pages 页都没有文本
|
return True # 前 max_pages 页都没有文本
|
||||||
|
|
||||||
|
def get_pdf_page_count(file_path):
    """
    Return the number of pages in a PDF document.

    Paths ending in ``.doc`` or ``.docx`` (case-insensitive) are first passed
    to ``docx2pdf``, and whatever it returns is opened in place of the
    original path.

    Args:
        file_path: Path of the document to inspect.

    Returns:
        The length of ``PdfReader(...).pages``, or 0 if any exception is
        raised while converting or reading (the error is printed, not
        propagated).
    """
    word_suffixes = ('.doc', '.docx')
    try:
        is_word_file = file_path.lower().endswith(word_suffixes)
        # Word files are converted before being handed to PdfReader.
        # NOTE(review): presumably docx2pdf returns the converted PDF's path — confirm.
        target_path = docx2pdf(file_path) if is_word_file else file_path
        page_total = len(PdfReader(target_path).pages)
    except Exception as err:
        print(f"读取 PDF 页码时出错:{err}")
        return 0
    return page_total
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||||
res=is_scanned_pdf(file_path)
|
res=is_scanned_pdf(file_path)
|
||||||
|
@ -173,16 +173,20 @@ def read_txt_to_string(file_path):
|
|||||||
- file_path (str): txt文件的路径
|
- file_path (str): txt文件的路径
|
||||||
|
|
||||||
返回:
|
返回:
|
||||||
- str: 包含文件内容的字符串
|
- str: 包含文件内容的字符串,如果出错返回空字符串 ""
|
||||||
"""
|
"""
|
||||||
|
if not file_path: # 检查文件路径是否为空
|
||||||
|
return ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(file_path, 'r', encoding='utf-8') as file: # 确保使用适当的编码
|
with open(file_path, 'r', encoding='utf-8') as file: # 确保使用适当的编码
|
||||||
content = file.read() # 使用 read() 保持文件格式
|
content = file.read() # 使用 read() 保持文件格式
|
||||||
return content
|
return content
|
||||||
except FileNotFoundError:
|
except (FileNotFoundError, IOError): # 捕获文件未找到或I/O错误
|
||||||
return "错误:文件未找到。"
|
return ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"错误:读取文件时发生错误。详细信息:{e}"
|
# 可根据需要记录错误日志,但返回值仍是空字符串
|
||||||
|
return ""
|
||||||
|
|
||||||
@sleep_and_retry
|
@sleep_and_retry
|
||||||
@limits(calls=10, period=1) # 每秒最多调用10次
|
@limits(calls=10, period=1) # 每秒最多调用10次
|
||||||
|
@ -359,27 +359,26 @@ def generate_prompt(judge_res, full_text=None):
|
|||||||
return base_prompt
|
return base_prompt
|
||||||
|
|
||||||
#文件内容以markdown格式组织,其中表格部分(若有)以html语法组织,
|
#文件内容以markdown格式组织,其中表格部分(若有)以html语法组织,
|
||||||
def get_technical_requirements(invalid_path,processed_filepath):
|
def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||||
|
judge_res=""
|
||||||
file_id = ""
|
file_id = ""
|
||||||
model_type = 1 # 默认使用豆包
|
full_text = read_txt_to_string(processed_filepath)
|
||||||
first_query_template="""该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否'
|
if model_type:
|
||||||
文件内容:
|
first_query_template = """该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否'
|
||||||
{full_text}
|
{}
|
||||||
"""
|
"""
|
||||||
judge_query = generate_full_user_query(processed_filepath, first_query_template)
|
judge_query = first_query_template.format(f"文件内容:{full_text}")
|
||||||
# print(judge_query)
|
judge_res = doubao_model(judge_query)
|
||||||
judge_res = doubao_model(judge_query)
|
if '否' in judge_res or model_type==0:
|
||||||
if '否' in judge_res:
|
|
||||||
model_type = 0 # 使用qianwen-long+invalid_path
|
model_type = 0 # 使用qianwen-long+invalid_path
|
||||||
print("no!调用invalid_path")
|
print("no!调用invalid_path")
|
||||||
if invalid_path.lower().endswith('.pdf'): #确保上传的是docx
|
if invalid_path.lower().endswith('.pdf'): #确保上传的是docx
|
||||||
invalid_path = pdf2docx(invalid_path)
|
invalid_path = pdf2docx(invalid_path)
|
||||||
file_id=upload_file(invalid_path)
|
file_id = upload_file(invalid_path)
|
||||||
user_query = generate_prompt(judge_res)
|
user_query = generate_prompt(judge_res)
|
||||||
model_res=qianwen_long(file_id,user_query)
|
model_res=qianwen_long(file_id,user_query)
|
||||||
print(model_res)
|
print(model_res)
|
||||||
else:
|
else:
|
||||||
full_text = read_txt_to_string(processed_filepath)
|
|
||||||
user_query=generate_prompt(judge_res,full_text)
|
user_query=generate_prompt(judge_res,full_text)
|
||||||
model_res=doubao_model(user_query)
|
model_res=doubao_model(user_query)
|
||||||
print(model_res)
|
print(model_res)
|
||||||
@ -465,7 +464,6 @@ def get_technical_requirements(invalid_path,processed_filepath):
|
|||||||
modified_key = key.replace('.', '下的')
|
modified_key = key.replace('.', '下的')
|
||||||
# 使用修改后的键填充第一个占位符,原始键填充第二个占位符
|
# 使用修改后的键填充第一个占位符,原始键填充第二个占位符
|
||||||
if model_type:
|
if model_type:
|
||||||
full_text = read_txt_to_string(processed_filepath)
|
|
||||||
new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容:{full_text}") #转豆包后取消注释
|
new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容:{full_text}") #转豆包后取消注释
|
||||||
else:
|
else:
|
||||||
new_query = user_query_template.format(modified_key, key, modified_key,"")
|
new_query = user_query_template.format(modified_key, key, modified_key,"")
|
||||||
@ -477,7 +475,6 @@ def get_technical_requirements(invalid_path,processed_filepath):
|
|||||||
# 将键中的 '.' 替换为 '下的'
|
# 将键中的 '.' 替换为 '下的'
|
||||||
modified_grouped_key = grouped_key.replace('.', '下的')
|
modified_grouped_key = grouped_key.replace('.', '下的')
|
||||||
if model_type:
|
if model_type:
|
||||||
full_text = read_txt_to_string(processed_filepath)
|
|
||||||
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,
|
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,
|
||||||
modified_grouped_key, f"文件内容:{full_text}")
|
modified_grouped_key, f"文件内容:{full_text}")
|
||||||
else:
|
else:
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
from flask_app.general.clean_pdf import get_pdf_page_count
|
||||||
from flask_app.general.doubao import pdf2txt
|
from flask_app.general.doubao import pdf2txt
|
||||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||||
from flask_app.general.format_change import pdf2docx
|
from flask_app.general.format_change import pdf2docx
|
||||||
@ -27,13 +27,19 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
|
|||||||
return DEFAULT_PROCUREMENT_REQS.copy()
|
return DEFAULT_PROCUREMENT_REQS.copy()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式
|
if procurement_path==invalid_path:
|
||||||
|
page_count = get_pdf_page_count(procurement_path)
|
||||||
|
model_type=0
|
||||||
|
processed_filepath=""
|
||||||
|
else:
|
||||||
|
model_type = 1
|
||||||
|
processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式
|
||||||
# processed_filepath = pdf2txt(procurement_path) # 纯文本提取
|
# processed_filepath = pdf2txt(procurement_path) # 纯文本提取
|
||||||
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
|
||||||
# 提交任务给线程池
|
# 提交任务给线程池
|
||||||
future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath)
|
future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
|
||||||
time.sleep(0.5) # 保持原有的延时
|
time.sleep(0.5) # 保持原有的延时
|
||||||
future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
|
future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
|
||||||
# 获取并行任务的结果
|
# 获取并行任务的结果
|
||||||
|
Loading…
x
Reference in New Issue
Block a user