2.18 空的docx无法被识别为非招标文件
This commit is contained in:
parent
518f5d7ac7
commit
b25592da4d
@ -160,8 +160,7 @@ def generate_full_user_query(file_path, prompt_template):
|
||||
return user_query
|
||||
|
||||
if __name__ == "__main__":
|
||||
error_msg='''Error code: 400 - {'error': {'code': 'RequestTimeOut', 'param': None, 'message': 'Request timed out, please try again later.', 'type': 'RequestTimeOut'}, 'id': 'chatcmpl-a92698d3-4e21-9243-b900-e32d4df4ac49', 'request_id': 'a92698d3-4e21-9243-b900-e32d4df4ac49'}
|
||||
'''
|
||||
error_msg='''openai.BadRequestError: Error code: 400 - {'error': {'code': 'invalid_parameter_error', 'param': None, 'message': 'File parsing in progress, please try again later.', 'type': 'invalid_request_error'}, 'id': 'chatcmpl-018e2b26-b3eb-907e-be6a-6ba8fccee86f', 'request_id': '018e2b26-b3eb-907e-be6a-6ba8fccee86f'}'''
|
||||
error_code, error_code_string, request_id=extract_error_details(error_msg)
|
||||
print(error_code)
|
||||
print(error_code_string)
|
||||
|
@ -99,7 +99,7 @@ def qianwen_long(file_id, user_query, max_retries=2, backoff_factor=1.0, need_ex
|
||||
break
|
||||
elif error_code == 400 and error_code_string in [
|
||||
'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed',
|
||||
'response_timeout', 'request_timeout', "RequestTimeOut"
|
||||
'response_timeout', 'request_timeout', "RequestTimeOut","invalid_parameter_error"
|
||||
]:
|
||||
logger.warning(f"错误代码为 400 - {error_code_string},将调用 qianwen_long_stream 执行一次...")
|
||||
try:
|
||||
@ -219,7 +219,7 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0,
|
||||
break
|
||||
elif error_code == 400 and error_code_string in [
|
||||
'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed',
|
||||
'response_timeout', 'request_timeout', "RequestTimeOut"
|
||||
'response_timeout', 'request_timeout', "RequestTimeOut","invalid_parameter_error"
|
||||
]:
|
||||
if attempt == 1: # 只重试一次
|
||||
logger.warning(f"错误代码为 400 - {error_code_string},将立即重试...")
|
||||
|
@ -200,8 +200,6 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
|
||||
if matching_files:
|
||||
matching_files_sorted = sorted(matching_files)
|
||||
all_pdfs_to_merge.extend(matching_files_sorted)
|
||||
for f in matching_files_sorted:
|
||||
print(f"选中文件: {f}")
|
||||
else:
|
||||
if required:
|
||||
print(f"没有找到以 '{suffix}' 结尾的文件。")
|
||||
|
@ -308,7 +308,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
|
||||
}
|
||||
""",
|
||||
3: """
|
||||
该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。
|
||||
该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该基于实际的招标文件内容,不得擅自总结删减,更不得回答示例输出中的内容,如果原文中未提及相关内容,在相应键值中填'未知'。
|
||||
示例输出如下,仅供格式参考:
|
||||
{
|
||||
"重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;",
|
||||
|
@ -66,10 +66,10 @@ def create_logger(app, subfolder):
|
||||
file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
file_handler.setFormatter(file_formatter)
|
||||
logger.addHandler(file_handler)
|
||||
stream_handler = logging.StreamHandler()
|
||||
stream_handler.setFormatter(logging.Formatter('%(message)s'))
|
||||
logger.addHandler(stream_handler)
|
||||
logger.setLevel(logging.INFO)
|
||||
# stream_handler = logging.StreamHandler()
|
||||
# stream_handler.setFormatter(logging.Formatter('%(message)s'))
|
||||
# logger.addHandler(stream_handler)
|
||||
logger.setLevel(logging.INFO) #Logger 只会处理大于或等于 INFO 级别的日志消息(例如 INFO、WARNING、ERROR、CRITICAL),而 DEBUG 级别的消息会被忽略。
|
||||
logger.propagate = False
|
||||
g.logger = logger
|
||||
g.output_folder = output_folder #输出文件夹路径
|
||||
|
@ -20,6 +20,7 @@ def process_pdf(pdf_path):
|
||||
"""子进程里实际执行的函数"""
|
||||
result = read_pdf_main(pdf_path=pdf_path)
|
||||
return result
|
||||
|
||||
def main():
|
||||
for i in range(3):
|
||||
logger = get_global_logger("123")
|
||||
|
30
flask_app/test_case/test_内存泄漏2.py
Normal file
30
flask_app/test_case/test_内存泄漏2.py
Normal file
@ -0,0 +1,30 @@
|
||||
import tracemalloc
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
def extract_text_by_page(file_path):
    """Run PyPDF2 text extraction over every page of a PDF file.

    The file at ``file_path`` is opened in binary mode, wrapped in a
    ``PdfReader``, and ``extract_text()`` is called on each page in order.

    NOTE(review): the text extracted from each page is not accumulated, so
    the returned string is always empty. This looks intentional for a
    memory-profiling driver (only the extraction work matters) -- confirm
    with the author before relying on the return value.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        An empty string.
    """
    with open(file_path, 'rb') as pdf_stream:
        reader = PdfReader(pdf_stream)
        for pdf_page in reader.pages:
            # The extracted text is not kept.
            pdf_page.extract_text()
    return ""
|
||||
|
||||
# Memory-profiling driver: measures how much memory remains allocated after
# one call to extract_text_by_page by diffing two tracemalloc snapshots.
# NOTE(review): these statements are not under an `if __name__ == "__main__":`
# guard in the visible code, so they would also run on import -- confirm.
# Start tracing memory allocations.
|
||||
tracemalloc.start()
|
||||
# Take the baseline snapshot before the function is called.
|
||||
snapshot_before = tracemalloc.take_snapshot()
|
||||
# Call the function under measurement.
# NOTE(review): hard-coded absolute Windows path; the script only works on a
# machine where this exact file exists.
|
||||
file_path=r'C:\Users\Administrator\Desktop\fsdownload\00550cfc-fd33-469e-8272-9215291b175c\ztbfile.pdf'
|
||||
result = extract_text_by_page(file_path)
|
||||
# Take a second snapshot after the call has returned.
|
||||
snapshot_after = tracemalloc.take_snapshot()
|
||||
# Diff the two snapshots; 'lineno' groups the statistics by source file and
# line number.
|
||||
stats = snapshot_after.compare_to(snapshot_before, 'lineno')
|
||||
print("[ Top 10 内存变化 ]")
|
||||
# Print only the first ten entries of the comparison result.
for stat in stats[:10]:
|
||||
print(stat)
|
||||
# Stop tracing memory allocations.
|
||||
tracemalloc.stop()
|
Loading…
x
Reference in New Issue
Block a user