diff --git a/flask_app/general/判断是否是招标文件.py b/flask_app/general/判断是否是招标文件.py index 02762b0..1b66766 100644 --- a/flask_app/general/判断是否是招标文件.py +++ b/flask_app/general/判断是否是招标文件.py @@ -4,10 +4,10 @@ from flask_app.general.通义千问long import upload_file, qianwen_long def judge_zbfile(pdf_path): - reader = PdfReader(pdf_path) - num_pages = len(reader.pages) - if num_pages <= 5: - return False + # reader = PdfReader(pdf_path) + # num_pages = len(reader.pages) + # if num_pages <= 5: + # return False user_query="""该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。 以下是常见的招标文件类型: 公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。 diff --git a/flask_app/routes/get_deviation.py b/flask_app/routes/get_deviation.py index e84a5ec..f0794d3 100644 --- a/flask_app/routes/get_deviation.py +++ b/flask_app/routes/get_deviation.py @@ -1,6 +1,6 @@ # flask_app/routes/get_deviation.py -from flask import Blueprint, jsonify, Response, g +from flask import Blueprint, Response, g import os from flask_app.general.format_change import download_file from flask_app.routes.偏离表main import get_tech_and_business_deviation diff --git a/flask_app/routes/偏离表main.py b/flask_app/routes/偏离表main.py index 410a1ae..5ae13a4 100644 --- a/flask_app/routes/偏离表main.py +++ b/flask_app/routes/偏离表main.py @@ -551,6 +551,9 @@ def process_functions_in_parallel(tech_deviation_info, busi_requirements_dict, z def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder,zb_type=2): global logger logger = get_global_logger(unique_id) + judge_res = judge_zbfile(file_path) + if not judge_res: + return None # 第一步:根据文件类型进行转换 if file_type == 1: # docx docx_path=file_path @@ -565,9 +568,6 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder, else: logger.error("不支持的文件类型!") return None - judge_res = judge_zbfile(pdf_path) - if not judge_res: - return None # 第二步:根据zb_type确定选择项和类别,并截取PDF if zb_type == 2: selections = [1, 2, 3, 5] diff --git a/flask_app/routes/小解析main.py b/flask_app/routes/小解析main.py index 5e4c8be..2fa915e 100644 --- a/flask_app/routes/小解析main.py +++ b/flask_app/routes/小解析main.py @@ -99,6 +99,9 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id): """ logger = get_global_logger(unique_id) logger.info("zb_type:"+str(zb_type)) + judge_res = judge_zbfile(file_path) + if not judge_res: + return None # 根据文件类型处理文件路径 if file_type == 1: # docx docx_path = file_path @@ -112,9 +115,6 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id): else: logger.error("Unsupported file type provided. Preprocessing halted.") return None - judge_res = judge_zbfile(pdf_path) - if not judge_res: - return None # 根据招标类型调用相应的解析函数 if zb_type == 2: # 货物标 combined_data = little_parse_goods(output_folder, pdf_path,logger) diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index a0b4552..1047b20 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -28,6 +28,9 @@ def preprocess_files(output_folder, file_path, file_type,logger): logger.info("starting 文件预处理...") logger.info("output_folder..." + output_folder) start_time=time.time() + judge_res = judge_zbfile(file_path) + if not judge_res: + return None # 根据文件类型处理文件路径 if file_type == 1: # docx # docx_path = file_path @@ -41,9 +44,6 @@ def preprocess_files(output_folder, file_path, file_type,logger): else: logger.error("Unsupported file type provided. Preprocessing halted.") return None - judge_res = judge_zbfile(pdf_path) - if not judge_res: - return None # 调用截取PDF多次 truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering') print("切割出的文件:"+str(truncate_files)) diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index 606fdf9..445bd7e 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -24,6 +24,9 @@ executor = ThreadPoolExecutor() def preprocess_files(output_folder, file_path, file_type,logger): logger.info("starting 文件预处理...") start_time = time.time() + judge_res = judge_zbfile(file_path) + if not judge_res: + return None logger.info("output_folder..." + output_folder) # 根据文件类型处理文件路径 if file_type == 1: # docx @@ -39,9 +42,6 @@ def preprocess_files(output_folder, file_path, file_type,logger): logger.error("Unsupported file type provided. Preprocessing halted.") return None - judge_res=judge_zbfile(pdf_path) - if not judge_res: - return None # 调用截取PDF多次 truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文 diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index 0eab7e0..f9eeede 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -68,7 +68,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin # 定义基础的 mid_pattern base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \ r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \ - r'|(?