2.18 An empty .docx cannot be recognized as a non-tender document

zy123 2025-02-18 15:45:07 +08:00
parent 518f5d7ac7
commit b25592da4d
7 changed files with 39 additions and 11 deletions


@@ -160,8 +160,7 @@ def generate_full_user_query(file_path, prompt_template):
    return user_query

if __name__ == "__main__":
-    error_msg='''Error code: 400 - {'error': {'code': 'RequestTimeOut', 'param': None, 'message': 'Request timed out, please try again later.', 'type': 'RequestTimeOut'}, 'id': 'chatcmpl-a92698d3-4e21-9243-b900-e32d4df4ac49', 'request_id': 'a92698d3-4e21-9243-b900-e32d4df4ac49'}
-    '''
+    error_msg='''openai.BadRequestError: Error code: 400 - {'error': {'code': 'invalid_parameter_error', 'param': None, 'message': 'File parsing in progress, please try again later.', 'type': 'invalid_request_error'}, 'id': 'chatcmpl-018e2b26-b3eb-907e-be6a-6ba8fccee86f', 'request_id': '018e2b26-b3eb-907e-be6a-6ba8fccee86f'}'''
    error_code, error_code_string, request_id = extract_error_details(error_msg)
    print(error_code)
    print(error_code_string)

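The test message above exercises extract_error_details, whose implementation is not shown in this commit; the new sample is the 400 'invalid_parameter_error' ("File parsing in progress") response that an empty .docx apparently triggers. A minimal sketch of what such a helper might look like, assuming a regex-based parse of the OpenAI-style error string (the body below is an illustration, not the project's actual code):

import re

def extract_error_details(error_msg):
    # Hypothetical parser: pull the HTTP status, the provider's error-code
    # string, and the request id out of an OpenAI-style error message.
    code_match = re.search(r"Error code:\s*(\d+)", error_msg)
    string_match = re.search(r"'code':\s*'([^']+)'", error_msg)
    request_match = re.search(r"'request_id':\s*'([^']+)'", error_msg)
    error_code = int(code_match.group(1)) if code_match else None
    error_code_string = string_match.group(1) if string_match else None
    request_id = request_match.group(1) if request_match else None
    return error_code, error_code_string, request_id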

@@ -99,7 +99,7 @@ def qianwen_long(file_id, user_query, max_retries=2, backoff_factor=1.0, need_ex
            break
        elif error_code == 400 and error_code_string in [
            'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed',
-            'response_timeout', 'request_timeout', "RequestTimeOut"
+            'response_timeout', 'request_timeout', "RequestTimeOut", "invalid_parameter_error"
        ]:
            logger.warning(f"错误代码为 400 - {error_code_string},将调用 qianwen_long_stream 执行一次...")
            try:
@@ -219,7 +219,7 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0,
            break
        elif error_code == 400 and error_code_string in [
            'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed',
-            'response_timeout', 'request_timeout', "RequestTimeOut"
+            'response_timeout', 'request_timeout', "RequestTimeOut", "invalid_parameter_error"
        ]:
            if attempt == 1:  # only retry once
                logger.warning(f"错误代码为 400 - {error_code_string},将立即重试...")

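Both call sites now treat 'invalid_parameter_error' as a retryable 400, which, per the test message above, is what the API returns while the uploaded file is still being parsed. A condensed sketch of the retry shape the two functions appear to share, reusing the hypothetical extract_error_details from the earlier sketch; the wrapper name, signature, and fallback behaviour are assumptions drawn only from these hunks:

import time

RETRYABLE_400_CODES = {
    'data_inspection_failed', 'ResponseTimeout', 'DataInspectionFailed',
    'response_timeout', 'request_timeout', 'RequestTimeOut', 'invalid_parameter_error',
}

def call_with_retries(primary, fallback=None, max_retries=2, backoff_factor=1.0):
    # Try the primary call; on a retryable 400 hand off once to the fallback
    # (e.g. the streaming variant), otherwise back off and retry or re-raise.
    for attempt in range(1, max_retries + 1):
        try:
            return primary()
        except Exception as exc:
            error_code, error_code_string, _ = extract_error_details(str(exc))
            if error_code == 400 and error_code_string in RETRYABLE_400_CODES and fallback:
                return fallback()
            if attempt == max_retries:
                raise
            time.sleep(backoff_factor * attempt)  # simple linear backoff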

@@ -200,8 +200,6 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
        if matching_files:
            matching_files_sorted = sorted(matching_files)
            all_pdfs_to_merge.extend(matching_files_sorted)
-            for f in matching_files_sorted:
-                print(f"选中文件: {f}")
        else:
            if required:
                print(f"没有找到以 '{suffix}' 结尾的文件。")

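The per-file "选中文件" debug prints are dropped; only the warning for a missing required suffix remains. For context, a reduced sketch of the suffix-match-and-merge pattern this function appears to follow, using PyPDF2's PdfMerger (the helper name, its parameters, and the required flag are assumptions, not the repository's signature):

import os
from PyPDF2 import PdfMerger

def merge_pdfs_by_suffix(output_folder, suffixes, output_path):
    # Collect files whose names end with each suffix (sorted for a stable
    # order) and merge them into a single PDF.
    to_merge = []
    for suffix, required in suffixes:
        matching = sorted(f for f in os.listdir(output_folder) if f.endswith(suffix))
        if matching:
            to_merge.extend(os.path.join(output_folder, f) for f in matching)
        elif required:
            print(f"没有找到以 '{suffix}' 结尾的文件。")
    merger = PdfMerger()
    for path in to_merge:
        merger.append(path)
    merger.write(output_path)
    merger.close()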

@@ -308,7 +308,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
            }
            """,
        3: """
-            该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标'、'不再招标'、'终止招标',键值应该完全与原文内容保持一致,不得擅自总结、删减,如果原文中未提及相关内容,键值中填'未知'。
+            该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标'、'不再招标'、'终止招标',键值应该基于实际的招标文件内容,不得擅自总结、删减,更不得回答示例输出中的内容,如果原文中未提及相关内容,相应键值中填'未知'。
            示例输出如下(仅供格式参考):
            {
                "重新招标": "有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;",

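The prompt for selection 3 is tightened so the model answers from the actual tender document rather than echoing the example output. Downstream, the reply still has to be read defensively; a small sketch of how the three keys might be extracted with '未知' as the fallback (the function and its tolerance for extra text around the JSON are assumptions, not code from this commit):

import json

def parse_rebid_answer(raw_answer):
    # Pull the outermost JSON object out of the model reply, then default any
    # missing key to '未知' so callers always see all three fields.
    start, end = raw_answer.find('{'), raw_answer.rfind('}')
    try:
        data = json.loads(raw_answer[start:end + 1]) if start != -1 and end > start else {}
    except json.JSONDecodeError:
        data = {}
    return {key: data.get(key, '未知') for key in ('重新招标', '不再招标', '终止招标')}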

@@ -66,10 +66,10 @@ def create_logger(app, subfolder):
    file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)
    logger.addHandler(file_handler)
-    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(logging.Formatter('%(message)s'))
-    logger.addHandler(stream_handler)
-    logger.setLevel(logging.INFO)
+    # stream_handler = logging.StreamHandler()
+    # stream_handler.setFormatter(logging.Formatter('%(message)s'))
+    # logger.addHandler(stream_handler)
+    logger.setLevel(logging.INFO)  # the logger only handles records at INFO level or above (INFO, WARNING, ERROR, CRITICAL); DEBUG messages are dropped
    logger.propagate = False
    g.logger = logger
    g.output_folder = output_folder  # output folder path

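With the StreamHandler commented out and propagate=False, records from these per-request loggers go only to their file handler and are neither echoed to the console nor re-handled by the root logger. A stripped-down sketch of the resulting pattern, where a plain logging.Formatter stands in for the project's CSTFormatter and the helper name is an assumption:

import logging

def make_file_only_logger(log_path, name):
    # One file handler, no console handler, no propagation to the root logger.
    logger = logging.getLogger(name)
    logger.handlers.clear()  # avoid stacking handlers if the name is reused
    handler = logging.FileHandler(log_path, encoding='utf-8')
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)  # DEBUG records are dropped
    logger.propagate = False
    return logger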

@@ -20,6 +20,7 @@ def process_pdf(pdf_path):
    """The function actually executed in the child process."""
    result = read_pdf_main(pdf_path=pdf_path)
    return result

def main():
    for i in range(3):
        logger = get_global_logger("123")

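process_pdf is the body that main() runs in a child process, presumably so that whatever memory read_pdf_main allocates is handed back to the OS when the child exits instead of staying resident in the long-lived service. A minimal illustration of that pattern with multiprocessing (the single-worker pool and the wrapper name are assumptions, and it reuses process_pdf from the hunk above):

from multiprocessing import Pool

def parse_pdf_in_child(pdf_path):
    # Run process_pdf in a short-lived worker; the worker's memory is freed
    # when the pool is torn down at the end of the with-block.
    with Pool(processes=1) as pool:
        return pool.apply(process_pdf, (pdf_path,))

On Windows the caller needs the usual if __name__ == "__main__" guard so the worker process can re-import the module.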

@@ -0,0 +1,30 @@
import tracemalloc
from PyPDF2 import PdfReader

def extract_text_by_page(file_path):
    result = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        num_pages = len(reader.pages)
        # print(f"Total pages: {num_pages}")
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            result += text or ""  # accumulate the text of each page
    return result

# Start tracking memory allocations
tracemalloc.start()

# Snapshot before the call
snapshot_before = tracemalloc.take_snapshot()

# Run the extraction
file_path = r'C:\Users\Administrator\Desktop\fsdownload\00550cfc-fd33-469e-8272-9215291b175c\ztbfile.pdf'
result = extract_text_by_page(file_path)

# Snapshot after the call
snapshot_after = tracemalloc.take_snapshot()

# Compare the two snapshots to see where memory was allocated
stats = snapshot_after.compare_to(snapshot_before, 'lineno')
print("[ Top 10 内存变化 ]")
for stat in stats[:10]:
    print(stat)

# Stop tracing memory allocations
tracemalloc.stop()
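compare_to ranks each source line by how much its allocations grew between the two snapshots, which is useful for checking whether PyPDF2 keeps page data alive after extract_text returns. One optional refinement is to filter both snapshots down to PyPDF2's own frames before comparing; the "*PyPDF2*" filename pattern is an assumption about the installed package layout:

pypdf_filter = [tracemalloc.Filter(True, "*PyPDF2*")]
stats_pypdf = snapshot_after.filter_traces(pypdf_filter).compare_to(
    snapshot_before.filter_traces(pypdf_filter), 'lineno')
for stat in stats_pypdf[:10]:
    print(stat)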