From 979d7fac21918cbb79b6a0bdc0e1ffac28c90efb Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 14 Feb 2025 15:46:07 +0800 Subject: [PATCH] =?UTF-8?q?2.14=20=E5=B0=9D=E8=AF=95=E8=A7=A3=E5=86=B3?= =?UTF-8?q?=E5=86=85=E5=AD=98=E6=B3=84=E6=BC=8F=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/format_change.py | 25 +- flask_app/general/insert_del_pagemark.py | 107 ++++---- flask_app/general/llm/通义千问long.py | 3 - flask_app/general/merge_pdfs.py | 7 +- flask_app/general/截取pdf通用函数.py | 297 ++++++++++++---------- flask_app/general/无效标和废标公共代码.py | 6 +- flask_app/general/读取文件/clean_pdf.py | 119 +++++---- flask_app/routes/判断是否是招标文件.py | 12 +- flask_app/start_up.py | 20 +- flask_app/工程标/截取pdf工程标版.py | 4 + flask_app/货物标/截取pdf货物标版.py | 115 +++++---- 11 files changed, 387 insertions(+), 328 deletions(-) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 23307a1..de96d19 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -9,12 +9,10 @@ from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_ima def download_file(url, local_filename,enable=False): """ 下载文件并保存到本地,基于Content-Type设置文件扩展名。 - 参数: - url (str): 文件的URL地址。 - local_filename (str): 本地保存的文件名(不含扩展名)。 - enable: 是否需要判断为扫描型/纯图片文件 - 返回: - tuple: (完整文件名, 文件类型代码) 文件类型代码: @@ -31,10 +29,10 @@ def download_file(url, local_filename,enable=False): # 获取Content-Type并猜测文件扩展名 content_type = response.headers.get('Content-Type', '') extension = mimetypes.guess_extension(content_type, strict=False) or '.docx' - - # 分离文件名和现有扩展名 - base, ext = os.path.splitext(local_filename) - if ext.lower() != extension: + extension_lower = extension.lower() #显式地将 extension 转换为小写(存入 extension_lower),更加健壮 + # 根据传入的 local_filename 构造完整的文件名 + base, current_ext = os.path.splitext(local_filename) + if current_ext.lower() != extension_lower: full_filename = base + extension else: full_filename = local_filename @@ -94,6 +92,7 @@ def local_file_2_url(file_path, url): print("format_change 文件上传成功") receive_file_response = response.content.decode('utf-8') receive_file_json = json.loads(receive_file_response) + print(receive_file_json) receive_file_url = receive_file_json["data"] else: @@ -207,17 +206,17 @@ def get_pdf_page_count(file_path): if __name__ == '__main__': # 替换为你的文件路径和API URL # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf" - # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx" + local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc" # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf" - local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf" - intermediate_docx = pdf2docx(local_path_in) - if intermediate_docx: - normal_pdf = docx2pdf(intermediate_docx, force=True) + # local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf" + # intermediate_docx = pdf2docx(local_path_in) + # if intermediate_docx: + # normal_pdf = docx2pdf(intermediate_docx, force=True) # # downloaded_file=pdf2docx(local_path_in) - # downloaded_file=docx2pdf(local_path_in) - # print(downloaded_file) + downloaded_file=docx2pdf(local_path_in) + print(downloaded_file) # test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3" # local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf' diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py index 2b95639..228ad31 100644 --- a/flask_app/general/insert_del_pagemark.py +++ b/flask_app/general/insert_del_pagemark.py @@ -23,37 +23,33 @@ def insert_mark(input_pdf_path): # 遍历每一页 for page_num in range(total_pages): + # 添加原始页面 page = pdf_reader.pages[page_num] pdf_writer.add_page(page) - # 创建一个内存中的PDF,用于存放带有文本的空白页 - packet = BytesIO() - # 获取当前页面的宽度和高度 - page_width = float(page.mediabox.width) - page_height = float(page.mediabox.height) - # 使用reportlab创建一个新的PDF页面 - c = canvas.Canvas(packet, pagesize=(page_width, page_height)) + # 使用 BytesIO 的上下文管理器创建内存中的 PDF 页面用于标记 + with BytesIO() as packet: + # 获取当前页面的宽度和高度 + page_width = float(page.mediabox.width) + page_height = float(page.mediabox.height) + # 使用 reportlab 创建一个新的 PDF 页面,页面大小与当前页面相同 + c = canvas.Canvas(packet, pagesize=(page_width, page_height)) - # 计算文本的位置(单位:点,1厘米 ≈ 28.35点) - x_position = 2.3 * cm - y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距 + # 计算文本位置:距左边距 2.3cm,距顶部 0.5cm + x_position = 2.3 * cm + y_position = page_height - 0.5 * cm + c.setFont("Helvetica", 12) + c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]") - # 绘制文本,使用 (page_num + 1) 作为索引 - c.setFont("Helvetica", 12) # 设置字体和大小 - c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]") + # 完成绘制,将内容写入 BytesIO 流 + c.save() - # 完成绘制 - c.save() - - # 将内存中的PDF读入PyPDF2 - packet.seek(0) - new_pdf = PdfReader(packet) - # blank_page = new_pdf.pages[0] - blank_page = copy.deepcopy(new_pdf.pages[0]) - packet.truncate(0) - packet.seek(0) - # 将带有文本的空白页添加到写入器 - pdf_writer.add_page(blank_page) + # 重置流位置,读取刚刚生成的 PDF 页面 + packet.seek(0) + new_pdf = PdfReader(packet) + # 使用 deepcopy 确保页面对象独立,避免后续修改影响已添加页面 + blank_page = copy.deepcopy(new_pdf.pages[0]) + pdf_writer.add_page(blank_page) # 将所有页面写入输出的PDF文件 with open(output_pdf_path, 'wb') as output_file: @@ -65,10 +61,14 @@ def insert_mark(input_pdf_path): print(f"发生错误: {e}") return input_pdf_path - def delete_mark(docx_path): + """ + 删除文档中包含标记文本 "[$$index_mark_" 的段落, + 如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落,也一并删除。 + 修改后的文档保存为 invalid_del.docx,并返回新文件路径。 + """ try: - docx = Document(docx_path) + doc = Document(docx_path) except KeyError as e: print(f"Error opening document: {e}") return "" @@ -76,30 +76,41 @@ def delete_mark(docx_path): print(f"Invalid package: {e}") return "" - # 继续处理文档 - find_flag = False - for para in docx.paragraphs: - # 匹配标记: [$$index_mark_X$$] + # 收集所有需要删除的段落,避免在遍历时直接修改列表 + paragraphs_to_remove = [] + paragraphs = doc.paragraphs + i = 0 + while i < len(paragraphs): + para = paragraphs[i] if "[$$index_mark_" in para.text: - para._element.getparent().remove(para._element) # 删标记 - find_flag = True - if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符 - para._element.getparent().remove(para._element) - find_flag = False - dir_path = os.path.dirname(docx_path) - new_file_path = os.path.join(dir_path, 'invalid_del.docx') + paragraphs_to_remove.append(para) + # 如果当前段落后面有段落且该段落的 XML 中包含 "w:sectPr",也删除该段落 + if i + 1 < len(paragraphs): + next_para = paragraphs[i + 1] + if "w:sectPr" in next_para._element.xml: + paragraphs_to_remove.append(next_para) + i += 1 # 跳过下一个段落,避免重复处理 + i += 1 - # 保存修改后的文档 - docx.save(new_file_path) + # 从文档 XML 中移除收集到的段落 + for para in paragraphs_to_remove: + parent = para._element.getparent() + if parent is not None: + parent.remove(para._element) + + new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx') + doc.save(new_file_path) return new_file_path + + if __name__ == '__main__': # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf' - input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf' - output=insert_mark(input) - # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx' - # res=delete_mark(doc_path) - # if res: - # print(res) - # else: - # print("No") \ No newline at end of file + # input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf' + # output=insert_mark(input) + doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx' + res=delete_mark(doc_path) + if res: + print(res) + else: + print("No") \ No newline at end of file diff --git a/flask_app/general/llm/通义千问long.py b/flask_app/general/llm/通义千问long.py index ab4911d..cf81acb 100644 --- a/flask_app/general/llm/通义千问long.py +++ b/flask_app/general/llm/通义千问long.py @@ -6,9 +6,6 @@ import time from pathlib import Path from openai import OpenAI import os - -from flask_app.general.llm.大模型通用函数 import get_total_tokens -from flask_app.general.llm.qianwen_turbo import qianwen_turbo from flask_app.general.llm.大模型通用函数 import extract_error_details, shared_rate_limit file_write_lock = threading.Lock() diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index cf60244..874365f 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -1,8 +1,6 @@ import os - import fitz from PyPDF2 import PdfReader, PdfWriter - from flask_app.general.截取pdf通用函数 import create_get_text_function @@ -67,6 +65,7 @@ def merge_with_fitz(paths, output_path): for path in paths: if not path.strip(): continue + pdf_doc = None try: pdf_doc = fitz.open(path) get_text = create_get_text_function('fitz', pdf_doc) @@ -88,7 +87,9 @@ def merge_with_fitz(paths, output_path): pdf_doc.close() except Exception as e: print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}") - continue + finally: + if pdf_doc is not None: + pdf_doc.close() if merged_pdf.page_count == 0: merged_pdf.close() diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index bd1dff0..20544de 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -1,5 +1,6 @@ # flask_app/general/截取pdf通用函数.py import concurrent.futures +import io import os import fitz @@ -37,8 +38,10 @@ def get_start_and_common_header(input_path, end_page): - common_header: 公共页眉内容。 - last_begin_index: 起始页的页码索引。 """ + pdf_document = None try: # 提取公共页眉 + common_header = extract_common_header(input_path) # 假设该函数已定义 last_begin_index = -1 @@ -48,7 +51,6 @@ def get_start_and_common_header(input_path, end_page): ) # 新增目录匹配模式 catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) - try: # 尝试使用 PyPDF2 读取 PDF pdf_document = PdfReader(input_path) @@ -86,58 +88,77 @@ def get_start_and_common_header(input_path, end_page): except Exception as e: print(f"Error in get_start_and_common_header: {e}") return "", -1 # 根据需求调整返回值 + finally: + # 如果使用的是 PyMuPDF,记得关闭文档 + if pdf_document and hasattr(pdf_document, "close"): + pdf_document.close() def check_pdf_pages(pdf_path, mode, logger): + pdf_document = None try: try: + # 尝试使用 PyPDF2 读取 PDF(新版可直接传入文件路径) pdf_document = PdfReader(pdf_path) num_pages = len(pdf_document.pages) # logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}") except Exception as e_pypdf2: - print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + # 如果 PyPDF2 读取失败,尝试使用 PyMuPDF 读取 PDF pdf_document = fitz.open(pdf_path) num_pages = pdf_document.page_count + if num_pages <= 30: logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。") if mode == 'goods': return True, ['', '', '', '', '', '', pdf_path, ''] else: return True, ['', '', '', '', '', pdf_path, ''] - # 若页数大于30页,返回None表示继续处理 + + # 若页数大于30页,返回 False 表示继续处理 return False, [] + except Exception as e: logger.error(f"无法读取 PDF 页数: {e}") - # 返回空列表意味着无法执行后续处理逻辑 if mode == 'goods': return True, ['', '', '', '', '', '', pdf_path, ''] else: return True, ['', '', '', '', '', pdf_path, ''] + finally: + # 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源 + if pdf_document and hasattr(pdf_document, "close"): + pdf_document.close() + + def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header): + pdf_document = None + using_pypdf2 = False + try: if start_page is None or end_page is None: print("错误: start_page 或 end_page 为 None") return "" - # 尝试使用 PyPDF2 读取 PDF + # 尝试使用 PyPDF2 读取 PDF(新版可以直接传入路径) try: pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) - get_text = lambda p: pdf_document.pages[p].extract_text() + # 定义提取文本函数 + get_text = create_get_text_function('pypdf2', pdf_document) using_pypdf2 = True - # print("使用 PyPDF2 成功读取 PDF 文件。") + # print("使用 PyPDF2 成功读取 PDF 文件。") except Exception as e_pypdf2: - print(f"save_extracted_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") - # 尝试使用 PyMuPDF 读取 PDF + print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF try: pdf_document = fitz.open(pdf_path) total_pages = pdf_document.page_count - get_text = lambda p: pdf_document.load_page(p).get_text() + get_text = create_get_text_function('fitz', pdf_document) using_pypdf2 = False # print("使用 PyMuPDF 成功读取 PDF 文件。") except Exception as e_fitz: - print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + print(f"save_extracted_pages: 使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") return "" # 检查页面范围的有效性 @@ -148,6 +169,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + # 如果 output_suffix 为 'notice' 并且 start_page > 0,则另存“目录前”的页面 if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") toc_page = -1 @@ -158,6 +180,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s except Exception as e_extract: print(f"提取第 {page_num} 页文本时出错: {e_extract}") continue + cleaned_text = clean_page_content(text, common_header) if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): toc_page = page_num @@ -204,8 +227,8 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") continue try: - with open(output_pdf_path, 'wb') as f: - output_doc.write(f) + with open(output_pdf_path, 'wb') as f_out: + output_doc.write(f_out) print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") return output_pdf_path except Exception as e_write: @@ -213,24 +236,28 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s return "" else: output_doc = fitz.open() - for page_num in range(start_page, end_page + 1): - try: - output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) - except Exception as e_page: - print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") - continue try: + for page_num in range(start_page, end_page + 1): + try: + output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) + except Exception as e_page: + print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") + continue output_doc.save(output_pdf_path) - output_doc.close() print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") return output_pdf_path except Exception as e_write: print(f"保存截取的页面时出错: {e_write}") return "" + finally: + output_doc.close() except Exception as e: print(f"发生未知错误: {e}") return "" - + finally: + # 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源 + if pdf_document and not using_pypdf2 and hasattr(pdf_document, "close"): + pdf_document.close() def is_pdf_or_doc(filename): # 判断文件是否为PDF或Word文档 @@ -242,14 +269,14 @@ def convert_to_pdf(file_path): return docx2pdf(file_path) return file_path -def get_invalid_file(file_path, output_folder, common_header, begin_page): +def get_invalid_file(pdf_path, output_folder, common_header, begin_page): """ 从指定的PDF文件中提取无效部分并保存到输出文件夹中。 begin_pattern匹配开头 end_pattern匹配结尾 页数小于100不进行begin_pattern,默认start_page=0 页数大于200时end_pattern从前往后匹配 Args: - file_path (str): 输入的PDF文件路径。 + pdf_path (str): 输入的PDF文件路径。 output_folder (str): 提取后文件的输出文件夹路径。 common_header (str): 公共头部文本,用于清理每页的内容。 @@ -284,23 +311,22 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) + pdf_document = None try: - # 打开PDF文件 + # 尝试使用 PyPDF2 读取 PDF try: - pdf_document = PdfReader(file_path) - pdf_lib = 'pypdf2' - # 获取总页数(对于 pypdf2 的 PDFDocument) + pdf_document = PdfReader(pdf_path) + get_text = create_get_text_function('pypdf2', pdf_document) total_pages = len(pdf_document.pages) except Exception as e: - print(f"get_invalid_file:使用 pypdf2 读取失败,错误信息: {e},切换至 fitz。") - pdf_document = fitz.open(file_path) - pdf_lib = 'fitz' - # 获取总页数(对于 fitz 的 PDFDocument) + pdf_document = fitz.open(pdf_path) + get_text = create_get_text_function('fitz', pdf_document) total_pages = pdf_document.page_count + if total_pages <= 50: - print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}") - return file_path, 0 - get_text = create_get_text_function(pdf_lib, pdf_document) + print(f"PDF页数({total_pages}) <= 50,直接返回文件路径: {pdf_path}") + return pdf_path, 0 + # 提取并清理每页的文本内容 page_texts = [] for i in range(total_pages): @@ -308,11 +334,12 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): cleaned_text = clean_page_content(text, common_header) if text else "" page_texts.append(cleaned_text) - # 定义查找起始页的函数 + # 内部函数:查找起始页 def find_start_page(begin_page): - for pattern in begin_patterns: + for pattern in begin_patterns: # 需提前定义好 for i in range(begin_page, min(begin_page + 30, total_pages)): text = page_texts[i] + # 若存在排除模式则跳过 if regex.search(exclusion_pattern, text): continue if regex.search(pattern, text): @@ -321,22 +348,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): # print(f"未在前30页找到模式: {pattern.pattern}") return 0 # 默认从第一页开始 - # 定义查找结束页的函数 + # 内部函数:查找结束页 def find_end_page(): if total_pages > 200: - # print("总页数大于200页,结束页从前往后查找,跳过前30页。") #且先匹配合同 - for pattern in end_patterns: + # 当页数较多时,从第31页开始正向查找结束模式 + for pattern in end_patterns: # 需提前定义好 for i in range(30, total_pages): text = page_texts[i] - exclusion_pattern = regex.compile( - r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE) - if regex.search(exclusion_pattern,text): + # 定义局部的排除模式 + exclusion_pat = regex.compile( + r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', + regex.MULTILINE + ) + if regex.search(exclusion_pat, text): continue if regex.search(pattern, text): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i else: - # print("结束页从后往前查找,确保只剩前30页时停止。") + # 当页数较少时,从后向前查找(至少保留前30页) for pattern in end_patterns: for i in range(total_pages - 1, 29, -1): text = page_texts[i] @@ -344,77 +374,76 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i - # 如果没有匹配到,设置end_page逻辑 + # 若未匹配到结束模式,设置默认结束页 if total_pages > 200: - print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {end_page}。" + file_path) + print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}。{pdf_path}") return max(100, int(total_pages * 1 / 2)) elif 100 < total_pages <= 200: - print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。" + file_path) + print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。 " + pdf_path) return max(100, int(total_pages * 2 / 3)) else: - print("未找到结束模式,设置结束页为最后一页。" + file_path) + print("未找到结束模式,设置结束页为最后一页。 " + pdf_path) return total_pages - 1 - # 根据总页数决定是否查找起始页 + # 根据总页数决定是否并发查找起始页与结束页 if total_pages < 100: - # print("总页数少于100页,默认起始页为第一页。") - start_page = 0 + start_page_found = 0 with concurrent.futures.ThreadPoolExecutor() as executor: future_end = executor.submit(find_end_page) - end_page = future_end.result() + end_page_found = future_end.result() else: with concurrent.futures.ThreadPoolExecutor() as executor: future_start = executor.submit(find_start_page, begin_page) future_end = executor.submit(find_end_page) - start_page = future_start.result() - end_page = future_end.result() - # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}") - # 验证页码范围 - if start_page > end_page: - print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})") + start_page_found = future_start.result() + end_page_found = future_end.result() + + # 检查页码范围是否有效 + if start_page_found > end_page_found: + print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})") return "", 0 - # 调用已实现的保存函数 + # 调用已实现的保存函数,保存提取的页面 output_path = save_extracted_pages( - pdf_path=file_path, + pdf_path=pdf_path, output_folder=output_folder, - start_page=start_page, - end_page=end_page, + start_page=start_page_found, + end_page=end_page_found, output_suffix="invalid", common_header=common_header ) - - # print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}") - return output_path, end_page + return output_path, end_page_found except Exception as e: - print(f"处理文件 {file_path} 时发生错误: {e}") + print(f"处理文件 {pdf_path} 时发生错误: {e}") return "", 0 + finally: + # 确保释放资源(对 fitz 的 pdf_document 需要关闭) + if pdf_document and hasattr(pdf_document, "close"): + pdf_document.close() + def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, output_suffix="normal", invalid_endpage=-1): start_page = None end_page = None flag = True - + # 先尝试使用 PyPDF2 try: - # 尝试使用 PyPDF2 读取 PDF pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) get_text = create_get_text_function('pypdf2', pdf_document) - # print("使用 PyPDF2 成功读取 PDF 文件。") except Exception as e_pypdf2: print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + # 如果 PyPDF2 失败,就再试 PyMuPDF try: - # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF pdf_document = fitz.open(pdf_path) total_pages = pdf_document.page_count get_text = create_get_text_function('fitz', pdf_document) - # print("使用 PyMuPDF 成功读取 PDF 文件。") except Exception as e_fitz: print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") - return None, None # 或者根据需求抛出异常 + return None, None end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages) for i in range(begin_page, end_limit): text = get_text(i) @@ -447,7 +476,9 @@ def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, comm if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text): end_page = i break - + # 如果使用的是 PyMuPDF,需要手动关闭文档 + if pdf_document and isinstance(pdf_document, fitz.Document): + pdf_document.close() return start_page, end_page @@ -534,7 +565,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h """ exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) - def run_extraction(pdf_document,begin_pattern, extraction_stage='first'): + def run_extraction(begin_pattern, extraction_stage='first'): """ 使用提供的 begin_pattern 运行提取过程。 根据 extraction_stage 判断是“第一次提取”还是“第三次提取”, @@ -554,14 +585,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h end_page = None combined_mid_pattern = None # 中间页的组合模式 catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) - total_pages = len(pdf_document.pages) if isinstance(pdf_document, PdfReader) else pdf_document.page_count end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages) # ===== 遍历每一页,执行提取逻辑 ===== for i in range(end_limit): try: - text = pdf_document.pages[i].extract_text() if isinstance(pdf_document, - PdfReader) else pdf_document.load_page(i).get_text() - text = text or "" + text = get_text(i) cleaned_text = clean_page_content(text, common_header) except Exception as e: print(f"提取第 {i} 页文本时出错: {e}") @@ -621,60 +649,77 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h path2 = path1 return path1, path2 + pdf_document = None try: - pdf_document = PdfReader(pdf_path) - total_pages = len(pdf_document.pages) - except Exception as e_pypdf2: - print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") try: - pdf_document = fitz.open(pdf_path) - total_pages = pdf_document.page_count - # print("使用 PyMuPDF 成功读取 PDF 文件。") - except Exception as e_fitz: - print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") - return "", "" # 或者根据需求抛出异常 + pdf_document = PdfReader(pdf_path) + get_text = create_get_text_function('pypdf2', pdf_document) + total_pages = len(pdf_document.pages) + except Exception as e_pypdf2: + print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + pdf_document = fitz.open(pdf_path) + get_text = create_get_text_function('fitz', pdf_document) + total_pages = pdf_document.page_count + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return "", "" # 或者根据需求抛出异常 - # 第一次提取尝试,使用初始的 begin_pattern - begin_pattern = regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|' - r'(?= max_pages: + if i >= max_pages or done_event.is_set(): break - # 尝试在 page_timeout 内获取文本 text = read_page_text_with_timeout(page, page_timeout) if text is None: - print(f"第 {i+1} 页文本提取超时,直接判定为扫描件。") - result_container[0] = True - done_event.set() + print(f"Page {i + 1} timeout") + result_queue.put(True) return if text.strip(): - print(f"第 {i+1} 页检测到文本,判定为非扫描件。") - result_container[0] = False - done_event.set() + print(f"Text found on page {i + 1}") + result_queue.put(False) return + + result_queue.put(True) except Exception as e: - print("处理 PDF 文件时发生异常:", e) - result_container[0] = True + print(f"Processing error: {str(e)}") + result_queue.put(True) + finally: done_event.set() - return - print(f"前 {max_pages} 页均未检测到文本,判定为扫描件。") - result_container[0] = True - done_event.set() - - result_container = [None] # 用于在子线程中传递结果 - done = threading.Event() - thread = threading.Thread(target=core_check, args=(result_container, done), daemon=True) + thread = threading.Thread(target=core_check, daemon=True) thread.start() + try: + result = result_queue.get(timeout=overall_timeout) + except queue.Empty: + print(f"Overall timeout after {overall_timeout}s") + return True # 或定义特殊超时状态码 - # 等待整体流程结束,最多等待 overall_timeout 秒 - finished_in_time = done.wait(overall_timeout) - if not finished_in_time: - print(f"整体检查超时(超过 {overall_timeout} 秒),返回默认结果。") - return True # 或者根据需要返回其它默认值 - - # 打印最终结果调试信息 - if result_container[0]: - print("最终结果:认为是扫描件(未检测到有效文本或发生单页超时)") - else: - print("最终结果:认为非扫描件(检测到有效文本)") - return result_container[0] + return result def is_pure_image(docx_path, percentage=0.3): """ diff --git a/flask_app/routes/判断是否是招标文件.py b/flask_app/routes/判断是否是招标文件.py index beb17dd..6fce979 100644 --- a/flask_app/routes/判断是否是招标文件.py +++ b/flask_app/routes/判断是否是招标文件.py @@ -12,8 +12,10 @@ def judge_zbfile_exec(file_path): start_time = time.time() # 检查文件是否为PDF格式 if file_path.lower().endswith('.pdf'): - reader = PdfReader(file_path) - num_pages = len(reader.pages) + # 使用 with 语句确保文件关闭 + with open(file_path, 'rb') as f: + reader = PdfReader(f) + num_pages = len(reader.pages) if num_pages <= 5: return False # 模拟使用大模型进行判断 @@ -32,11 +34,7 @@ def judge_zbfile_exec(file_path): print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒") print(f"判断是否属于招标文件:{model_res}") - # 根据模型返回结果判断 - if '否' in model_res: - return False - else: - return True + return '否' not in model_res except Exception as e: print(f"处理文件时出错: {e}") diff --git a/flask_app/start_up.py b/flask_app/start_up.py index ae18e77..c3b8a9a 100644 --- a/flask_app/start_up.py +++ b/flask_app/start_up.py @@ -1,4 +1,5 @@ # flask_app/start_up.py +import os from flask import Flask, g from flask_app.ConnectionLimiter import ConnectionLimiter @@ -11,16 +12,10 @@ from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids from flask_app.routes.judge_zbfile import judge_zbfile_bp -class FlaskAppWithLimiter(Flask): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # 初始化一个字典来存储每个蓝图的限流器 - self.connection_limiters = {} - def create_app(): - app = FlaskAppWithLimiter(__name__) # 创建全局日志记录器 + app = Flask(__name__) app.global_logger = create_logger_main('global_log') # 全局日志记录器 # 注册蓝图 app.register_blueprint(get_deviation_bp) @@ -28,11 +23,14 @@ def create_app(): app.register_blueprint(upload_bp) app.register_blueprint(test_zbparse_bp) app.register_blueprint(judge_zbfile_bp) - # app.connection_limiters['upload'] = ConnectionLimiter(max_connections=100) - # app.connection_limiters['get_deviation'] = ConnectionLimiter(max_connections=100) - # app.connection_limiters['default'] = ConnectionLimiter(max_connections=100) - # app.connection_limiters['judge_zbfile'] = ConnectionLimiter(max_connections=100) + @app.teardown_request + def check_fd(exception=None): + logger = g.logger + end_fd = os.listdir('/proc/self/fd') + leaked = set(end_fd) - set(g.start_fd) + if leaked: + logger.warning(f"潜在FD泄漏: {len(leaked)}个未关闭") # @app.teardown_request # def teardown_request(exception): # # 接口请求之后都会执行该代码,做一些清理工作 diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index cf0c35f..ec51d0d 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -29,6 +29,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l 处理 PDF 文件,作为 truncate_pdf_main 的后备方法。 此函数将在所有模式对都失败后调用。 """ + pdf_document=None try: try: pdf_document = PdfReader(pdf_path) @@ -109,6 +110,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l except Exception as e: print(f"Error in extract_pages_twice: {e}") return "" + finally: + if pdf_document and hasattr(pdf_document, 'close'): + pdf_document.close() def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1): diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index ebb6138..21cc27a 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -55,7 +55,6 @@ def get_patterns_for_evaluation_method(): return begin_pattern, end_pattern def extract_pages_qualification(pdf_path, begin_page, common_header): - pdf_document = PdfReader(pdf_path) # 开始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = regex.compile( r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)(?!的)', @@ -84,65 +83,69 @@ def extract_pages_qualification(pdf_path, begin_page, common_header): end_page = None include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"] exclude_keywords = ["声明函", "承诺函"] - + pdf_document = None try: - # 尝试使用 PyPDF2 读取 PDF - pdf_document = PdfReader(pdf_path) - total_pages = len(pdf_document.pages) - get_text = create_get_text_function('pypdf2', pdf_document) - # print("使用 PyPDF2 成功读取 PDF 文件。") - except Exception as e_pypdf2: - print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") try: - # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF - pdf_document = fitz.open(pdf_path) - total_pages = pdf_document.page_count - get_text = create_get_text_function('fitz', pdf_document) - # print("使用 PyMuPDF 成功读取 PDF 文件。") - except Exception as e_fitz: - print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") - return None, None # 或者根据需求抛出异常 - # 从指定的开始页开始遍历 - for i in range(begin_page, total_pages): - text = get_text(i) - if text: - cleaned_text = clean_page_content(text, common_header) - # 优先检查是否匹配优先模式 - priority_match = priority_pattern.search(cleaned_text) - if priority_match and start_page is None: - start_page = i - print(f"匹配到priority_pattern,设置起始页为: {start_page}") - continue - else: - # 如果未匹配优先模式,则按照一般模式进行判断 - if ( - any(keyword in cleaned_text for keyword in include_keywords) and - all(keyword not in cleaned_text for keyword in exclude_keywords) and - start_page is None - ): - if begin_pattern.search(cleaned_text): - start_page = i - print(f"匹配到附录等模式begin_pattern,设置起始页为: {start_page}") - continue + # 尝试使用 PyPDF2 读取 PDF + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + get_text = create_get_text_function('pypdf2', pdf_document) + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF + pdf_document = fitz.open(pdf_path) + total_pages = pdf_document.page_count + get_text = create_get_text_function('fitz', pdf_document) + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return None, None # 或者根据需求抛出异常 + # 从指定的开始页开始遍历 + for i in range(begin_page, total_pages): + text = get_text(i) + if text: + cleaned_text = clean_page_content(text, common_header) + # 优先检查是否匹配优先模式 + priority_match = priority_pattern.search(cleaned_text) + if priority_match and start_page is None: + start_page = i + print(f"匹配到priority_pattern,设置起始页为: {start_page}") + continue + else: + # 如果未匹配优先模式,则按照一般模式进行判断 + if ( + any(keyword in cleaned_text for keyword in include_keywords) and + all(keyword not in cleaned_text for keyword in exclude_keywords) and + start_page is None + ): + if begin_pattern.search(cleaned_text): + start_page = i + print(f"匹配到附录等模式begin_pattern,设置起始页为: {start_page}") + continue - # 确定结束页 - 附录、附件、附表等 - if start_page is not None and end_pattern_attachment.search(cleaned_text): - # 额外检查当前页面是否不包含任何 include_keywords - if not any(keyword in cleaned_text for keyword in include_keywords): + # 确定结束页 - 附录、附件、附表等 + if start_page is not None and end_pattern_attachment.search(cleaned_text): + # 额外检查当前页面是否不包含任何 include_keywords + if not any(keyword in cleaned_text for keyword in include_keywords): + if i > start_page: + end_page = i + print(f"qualification:找到结束页 (附件类): {end_page}") + break # 找到结束页后退出循环 + else: + print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}") + + # 确定结束页 - 章节标题 + elif start_page is not None and end_pattern_chapter.search(cleaned_text): if i > start_page: end_page = i - print(f"qualification:找到结束页 (附件类): {end_page}") + print(f"qualification:找到结束页 (章节标题): {end_page}") break # 找到结束页后退出循环 - else: - print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}") - - # 确定结束页 - 章节标题 - elif start_page is not None and end_pattern_chapter.search(cleaned_text): - if i > start_page: - end_page = i - print(f"qualification:找到结束页 (章节标题): {end_page}") - break # 找到结束页后退出循环 - return start_page, end_page + return start_page, end_page + finally: + if pdf_document and hasattr(pdf_document, 'close'): + pdf_document.close() def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger): @@ -322,8 +325,8 @@ if __name__ == "__main__": pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"C:\Users\Administrator\Desktop\货物标\output5" + output_folder = r"C:\Users\Administrator\Desktop\货物标\output4" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - selection = 5 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid + selection = 4 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) print(generated_files)