From adc580a5f1fa11ef198d0b616977679294d9405f Mon Sep 17 00:00:00 2001 From: zxc <571036709@qq.com> Date: Wed, 5 Mar 2025 16:09:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=8B=9B=E6=A0=87=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=88=A4=E6=96=AD&=E6=96=B0=E5=A2=9E=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E9=87=87=E8=B4=AD=E9=99=84=E4=BB=B6=E5=8F=82=E6=95=B0?= =?UTF-8?q?&=E4=BC=98=E5=8C=96=E6=8B=9B=E6=A0=87=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E9=AA=8C=E8=AF=81=E5=88=87=E5=88=86=E6=AD=A3=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/merge_pdfs.py | 44 ++++++++++++++++++- flask_app/general/截取pdf通用函数.py | 5 ++- flask_app/routes/upload.py | 18 +++++--- flask_app/routes/utils.py | 6 ++- flask_app/routes/判断是否是招标文件.py | 61 ++++++++++++++++++++++++-- flask_app/routes/工程标解析main.py | 6 +-- flask_app/routes/货物标解析main.py | 51 +++++++++++++++++---- 7 files changed, 164 insertions(+), 27 deletions(-) diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index b103b29..99f3868 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -1,8 +1,50 @@ import os import fitz -from PyPDF2 import PdfReader, PdfWriter +import tempfile +import logging +from PyPDF2 import PdfReader, PdfWriter, PdfMerger from flask_app.general.读取文件.clean_pdf import create_get_text_function + +def merge_pdf_to(original_pdf, to_pdf, logger) -> str: + """ merge original pdf to exists pdf """ + merger = None + temp_filename = None + try: + # 初始化PdfMerger + merger = PdfMerger() + + # 读取目标PDF的内容 + with open(to_pdf, 'rb') as f: + merger.append(f) + + # 读取原始PDF的内容 + with open(original_pdf, 'rb') as f: + merger.append(f) + + # 创建临时文件 + with tempfile.NamedTemporaryFile(dir=os.path.dirname(to_pdf), suffix='.pdf', delete=False) as temp_file: + temp_filename = temp_file.name + merger.write(temp_file) + + # 用临时文件替换原文件 + os.replace(temp_filename, to_pdf) + return to_pdf + except Exception as e: + logger.error(f"merge_pdf_to 遇到错误: {str(e)}") + # 发生错误时删除临时文件 + if temp_filename and os.path.exists(temp_filename): + try: + os.remove(temp_filename) + except: + pass + return to_pdf + finally: + # 确保merger和临时文件被正确关闭/清理 + if merger is not None: + merger.close() + + #合并PDF # 主合并函数,尝试使用 PyPDF2,若失败则使用 fitz def merge_pdfs(paths, output_path): diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 3d0aeb6..1415009 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -319,8 +319,11 @@ def get_invalid_file(pdf_path, output_folder, common_header, begin_page): if regex.search(exclusion_pat, text): continue if regex.search(pattern, text): + end_exclusion_pat = regex.compile(r'第[一二三四五六七八九十百千]+(?:章|部分)\s*“[^“”]*合同[^“”]*”', regex.DOTALL) + if regex.search(end_exclusion_pat, text): + continue # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") - return i + return i else: # 当页数较少时,从后向前查找(至少保留前30页) for pattern in end_patterns: diff --git a/flask_app/routes/upload.py b/flask_app/routes/upload.py index 0a0124c..c1453f8 100644 --- a/flask_app/routes/upload.py +++ b/flask_app/routes/upload.py @@ -15,20 +15,20 @@ from flask_app.ConnectionLimiter import require_connection_limit, require_execut upload_bp = Blueprint('upload', __name__) -def _child_target(main_func, queue, output_folder, file_path, file_type, unique_id): +def _child_target(main_func, queue, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id): """ 子进程中调用 `main_func`(它是一个生成器函数), 将其 yield 出的数据逐条放进队列,最后放一个 None 表示结束。 """ try: - for data in main_func(output_folder, file_path, file_type, unique_id): + for data in main_func(output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id): queue.put(data) except Exception as e: # 如果要把异常也传给父进程,以便父进程可感知 queue.put(json.dumps({'error': str(e)}, ensure_ascii=False)) finally: queue.put(None) -def run_in_subprocess(main_func, output_folder, file_path, file_type, unique_id): +def run_in_subprocess(main_func, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id): """ 启动子进程调用 `main_func(...)`,并在父进程流式获取其输出(通过 Queue)。 子进程结束时,操作系统回收其内存;父进程则保持实时输出。 @@ -36,7 +36,7 @@ def run_in_subprocess(main_func, output_folder, file_path, file_type, unique_id) queue = multiprocessing.Queue() p = multiprocessing.Process( target=_child_target, - args=(main_func, queue, output_folder, file_path, file_type, unique_id) + args=(main_func, queue, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id) ) p.start() @@ -59,11 +59,12 @@ def zbparse(): #大解析 logger.info("Received JSON data: " + str(received_data)) file_url = g.file_url zb_type = g.zb_type + require_file_url = g.require_file_url file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0])) logger.info(f"Starting parsing file: {file_name}") try: logger.info("starting parsing url:" + file_url) - return process_and_stream(file_url, zb_type) #主要执行函数 + return process_and_stream(file_url, zb_type, require_file_url) #主要执行函数 except Exception as e: logger.error('Exception occurred: ' + str(e)) if hasattr(g, 'unique_id'): @@ -84,7 +85,7 @@ def zbparse(): #大解析 data='Internal server error' ) return jsonify(error_response) -def process_and_stream(file_url, zb_type): +def process_and_stream(file_url, zb_type, require_file_url): """ 下载文件并进行处理,支持工程标和货物标的处理。 """ @@ -92,11 +93,14 @@ def process_and_stream(file_url, zb_type): unique_id = g.unique_id output_folder = g.output_folder filename = "ztbfile" + require_filename = "ztbfile_require" downloaded_filename = os.path.join(output_folder, filename) + require_downloaded_filename = os.path.join(output_folder, require_filename) start_time = time.time() file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0])) try: downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True) + require_downloaded_filepath, require_file_type = ("", -1) if not require_file_url else download_file(require_file_url, require_downloaded_filename, True) if not downloaded_filepath or file_type == 4: logger.error("下载文件失败或不支持的文件类型") log_error_unique_id(unique_id, 1, file_name=file_name) @@ -119,7 +123,7 @@ def process_and_stream(file_url, zb_type): } processing_func = processing_functions.get(zb_type, goods_bid_main) - for data in run_in_subprocess(processing_func, output_folder, downloaded_filepath, file_type, unique_id): #逐一接收货物标 工程标解析内容,为前端网页展示服务 + for data in run_in_subprocess(processing_func, output_folder, downloaded_filepath, file_type, require_downloaded_filepath, require_file_type, unique_id): #逐一接收货物标 工程标解析内容,为前端网页展示服务 if not data.strip(): logger.error("Received empty data, skipping JSON parsing.") continue diff --git a/flask_app/routes/utils.py b/flask_app/routes/utils.py index b5a5307..9240714 100644 --- a/flask_app/routes/utils.py +++ b/flask_app/routes/utils.py @@ -16,13 +16,14 @@ def validate_request(): return jsonify({'error': 'Missing JSON in request'}), 400 file_url = request.json.get('file_url') zb_type = request.json.get('zb_type', 2) #zb_type:默认按货物标解析 + require_file_url = request.json.get('require_file_url', "") if not file_url: return jsonify({'error': 'No file URL provided'}), 400 try: zb_type = int(zb_type) except (ValueError, TypeError): return jsonify({'error': 'Invalid zb_type provided'}), 400 - return file_url, zb_type + return file_url, zb_type, require_file_url def generate_deviation_response(tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,proof_materials, logger): logger.info(f"技术偏离表: {json.dumps(tech_deviation, ensure_ascii=False, indent=4)}") @@ -105,7 +106,7 @@ def validate_and_setup_logger(f): # 进行请求验证 validation_result = validate_request() if isinstance(validation_result, tuple): - file_url, zb_type = validation_result + file_url, zb_type, require_file_url = validation_result # 根据蓝图确定子文件夹 blueprint = request.blueprint @@ -125,6 +126,7 @@ def validate_and_setup_logger(f): # 将验证后的数据存储在 g 对象中 g.file_url = file_url g.zb_type = zb_type + g.require_file_url = require_file_url return f(*args, **kwargs) else: # 验证失败,返回错误响应 diff --git a/flask_app/routes/判断是否是招标文件.py b/flask_app/routes/判断是否是招标文件.py index ef1aec6..b4ce2c9 100644 --- a/flask_app/routes/判断是否是招标文件.py +++ b/flask_app/routes/判断是否是招标文件.py @@ -2,30 +2,76 @@ import time from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2 from docx import Document import fitz +from flask_app.general.llm.qianwen_plus import qianwen_plus from flask_app.general.llm.通义千问long import upload_file, qianwen_long -def judge_zbfile_exec(file_path): + +def pre_judge(few_pages_content: str): + if few_pages_content.strip() == "": + return "未知" + + # 使用大模型进行判断 + user_query = f""" + 一, 任务描述: + 我会给定一段从文件中摘取出来的文本, 你需要根据该文本的内容来判断文件是否属于招标文件。 + + 二, 判断准则: + 以下是常见的招标文件类型: + 公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。 + 若有未涵盖的类型,但其内容明确表达了项目需求、采购或招标信息,且包含指导投标人参与的关键要素,则可视为招标文件。 + 排除情况: + 1. 请注意区分招标文件和投标文件,若文件仅有投标文件格式要求部分,或是投标、响应性文件,则不视为招标文件。 + 2. 若文件内容为乱码,无有效信息,请直接返回"否"。 + + 三, 注意事项: 你并不是完全依赖于判断准则来给出判断, 而是根据自己对招标文件的理解来判断, 给定判断准则只是帮助你进行理解。 + + 四, 回答格式: 你的回答只能为"是","否","未知"三种情况之一, 不要在回答中进行任何的解释或输出其他不属于"是","否","未知"的字符。 + + 五, 给定从文件中摘取出来的文本: + {few_pages_content} + + 提问: 该内容是否属于招标文件? 如果属于返回"是"; 如果不属于则返回"否"; 如果你无法判断则返回"未知", 不要进行假设性判断, 不理解就返回"未知"。 + """ + start_time = time.time() + model_res = qianwen_plus(user_query) + end_time = time.time() + print(f"pre_judge 判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒") + + if "是" in model_res: + return "是" + elif "否" in model_res: + return "否" + else: + return "未知" + + +def judge_zbfile_exec(file_path, end_page=5, max_len=3000): """ 判断文件是否属于招标文件,并返回结果。 """ try: start_time = time.time() + few_pages_content = "" # 检查文件是否为PDF格式 if file_path.lower().endswith('.pdf'): try: with open(file_path, 'rb') as f: reader = PdfReader(f) num_pages = len(reader.pages) + if num_pages > end_page: + few_pages_content = "".join([reader.pages[i].extract_text() for i in range(end_page)]) except Exception: try: doc = fitz.open(file_path) num_pages = len(doc) + if num_pages > end_page: + few_pages_content = "".join([doc.load_page(i).get_text() for i in range(end_page)]) except Exception: print("PDF 文件读取失败") return False # 两种解析方式都失败,直接返回 False - if num_pages <= 5: return False # 小于等于 5 页的 PDF 直接判定为非招标文件 + few_pages_content = few_pages_content[:max_len] elif file_path.lower().endswith('.docx'): try: doc = Document(file_path) @@ -37,13 +83,20 @@ def judge_zbfile_exec(file_path): chunk = paragraphs[i:i + chunk_size] for para in chunk: accumulated_text += para.text - if len(accumulated_text) >= 1000: + if len(accumulated_text) >= max_len: break # 读取超过1000字后即可停止 if len(accumulated_text) < 1000: return False # 若累计内容不足1000字,则直接返回 False except Exception: print("DOCX 文件读取失败,可能为乱码文件") return False # 解析失败直接返回 False + few_pages_content = accumulated_text[:max_len] + + # pre_judge -> 是, 否, 未知 + pre_res = pre_judge(few_pages_content) + if pre_res != "未知": + return '否' not in pre_res + pre_endtime=time.time() print(f"judge_zbfile_exec预处理耗时:{pre_endtime - start_time:.2f} 秒") # 使用大模型进行判断 @@ -60,7 +113,7 @@ def judge_zbfile_exec(file_path): model_res = qianwen_long(file_id, user_query) end_time = time.time() print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒") - print(f"判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒") + print(f"qianwen_long 判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒") return '否' not in model_res diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index 944cabf..56303b7 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -33,7 +33,7 @@ def engineering_bid_main_process(output_folder, file_path, file_type, unique_id) return result -def preprocess_files(output_folder, file_path, file_type, logger): +def preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger): logger.info("starting 文件预处理...") start_time = time.time() is_pure_image_flag = False @@ -239,10 +239,10 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa return result -def engineering_bid_main(output_folder, file_path, file_type, unique_id): +def engineering_bid_main(output_folder, file_path, file_type, require_file_path, require_file_type, unique_id): logger = get_global_logger(unique_id) # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, file_path, file_type, logger) + processed_data = preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger) if not processed_data: error_response = { 'error': '文件预处理失败。请检查文件类型并重试。' diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index dab60f9..9ba3ee1 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -15,17 +15,18 @@ from flask_app.general.无效标和废标公共代码 import combine_find_invali from flask_app.货物标.资格审查main import combine_qualification_review from flask_app.general.商务技术评分提取 import combine_evaluation_standards from concurrent.futures import ThreadPoolExecutor +from flask_app.general.merge_pdfs import merge_pdf_to, merge_pdfs + + +def verify_file(file_path, file_type, logger): + pdf_path = "" + is_pure_image_flag = False # 判断是否为纯图片类型的docx -def preprocess_files(output_folder, file_path, file_type,logger): - logger.info("starting 文件预处理...") - start_time = time.time() - is_pure_image_flag=False #判断是否为纯图片类型的docx - pdf_path="" # 根据文件类型处理文件路径 if file_type == 1: # docx # docx_path = file_path if is_pure_image(file_path): - is_pure_image_flag=True + is_pure_image_flag = True else: pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 elif file_type == 2: # pdf @@ -36,7 +37,37 @@ def preprocess_files(output_folder, file_path, file_type,logger): pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 else: logger.error("Unsupported file type provided. Preprocessing halted.") + return None, None + + return pdf_path, is_pure_image_flag + + +def preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger): + logger.info("starting 文件预处理...") + start_time = time.time() + # is_pure_image_flag=False #判断是否为纯图片类型的docx + # pdf_path="" + # # 根据文件类型处理文件路径 + # if file_type == 1: # docx + # # docx_path = file_path + # if is_pure_image(file_path): + # is_pure_image_flag=True + # else: + # pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 + # elif file_type == 2: # pdf + # pdf_path = file_path + # # docx_path = pdf2docx(pdf_path) + # elif file_type == 3: # doc + # # docx_path = doc2docx(file_path) + # pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 + # else: + # logger.error("Unsupported file type provided. Preprocessing halted.") + # return None + pdf_path, is_pure_image_flag = verify_file(file_path, file_type, logger) + require_pdf_path, _ = verify_file(require_file_path, require_file_type, logger) + if pdf_path is None: return None + if not is_pure_image_flag: #大多数情况 不是纯图片doc/docx # 调用截取PDF多次 truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') @@ -44,7 +75,9 @@ def preprocess_files(output_folder, file_path, file_type,logger): truncate_files=['','','','','','',file_path,''] #纯图片,无需切片 # print("切割出的文件:"+str(truncate_files)) # 处理各个部分 - procurement_path = truncate_files[5] # 采购需求 + # procurement_path = truncate_files[5] # 采购需求 + procurement_path = truncate_files[5] if not require_pdf_path else merge_pdf_to(require_pdf_path, truncate_files[5], logger) # 采购需求 + # procurement_path = truncate_files[5] if not require_pdf_path else merge_pdfs([truncate_files[5], require_pdf_path], truncate_files[5]) evaluation_method_path = truncate_files[1] # 评标办法 qualification_path = truncate_files[2] # 资格审查 tobidders_notice_path = truncate_files[4] # 投标人须知正文 @@ -223,10 +256,10 @@ def post_process_baseinfo(base_info,logger): logger.error(f"Error in post_process_baseinfo: {e}") return base_info, [] # 返回空列表 -def goods_bid_main(output_folder, file_path, file_type, unique_id): +def goods_bid_main(output_folder, file_path, file_type, require_file_path, require_file_type, unique_id): logger = get_global_logger(unique_id) # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, file_path, file_type,logger) + processed_data = preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger) if not processed_data: error_response = { 'error': '文件预处理失败。请检查文件类型并重试。'