From 6618c6d24792a225d206c0c7a05200ed6e66b2a0 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 21 Jan 2025 14:21:49 +0800 Subject: [PATCH 1/2] =?UTF-8?q?1.20=20pypdf2=E8=AF=BB=E5=8F=96=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E5=8F=AF=E8=BD=AC=E4=B8=BA=E4=BD=BF=E7=94=A8fitz?= =?UTF-8?q?=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/clean_pdf.py | 124 ++++-- flask_app/general/截取pdf_main.py | 4 +- flask_app/general/截取pdf通用函数.py | 382 +++++++++++++----- .../general/投标人须知正文提取指定内容.py | 29 +- .../投标人须知正文条款提取成json文件.py | 4 +- flask_app/general/读取文件/按页读取pdf.py | 2 +- flask_app/工程标/截取pdf工程标版.py | 33 +- flask_app/货物标/截取pdf货物标版.py | 51 ++- 8 files changed, 457 insertions(+), 172 deletions(-) diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index 7a6b915..f6c1b2f 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -1,17 +1,38 @@ import re + +import fitz from PyPDF2 import PdfReader def extract_common_header(pdf_path): - def get_headers(pdf_document, start_page, pages_to_read): + """ + 提取 PDF 文件的公共页眉。 + + 参数: + - pdf_path: PDF 文件路径。 + + 返回: + - common_header: 公共页眉内容,字符串。如果未找到,则返回空字符串。 + """ + + def get_headers(pdf_document, start_page, pages_to_read, is_pypdf2): headers = [] - for i in range(start_page, min(start_page + pages_to_read, len(pdf_document.pages))): - page = pdf_document.pages[i] - text = page.extract_text() or "" - if text: - # 只取每页的前三行,去除前后的空白字符 - first_lines = [line.strip() for line in text.strip().split('\n')[:3]] - headers.append(first_lines) + for i in range(start_page, min(start_page + pages_to_read, + len(pdf_document.pages) if is_pypdf2 else pdf_document.page_count)): + try: + if is_pypdf2: + page = pdf_document.pages[i] + text = page.extract_text() or "" + else: + page = pdf_document.load_page(i) + text = page.get_text() or "" + if text: + # 只取每页的前三行,去除前后的空白字符 + first_lines = [line.strip() for line in text.strip().split('\n')[:3]] + headers.append(first_lines) + except Exception as e: + print(f"提取第 {i} 页文本时出错: {e}") + continue return headers def find_common_headers(headers): @@ -49,45 +70,66 @@ def extract_common_header(pdf_path): return common_headers - pdf_document = PdfReader(pdf_path) - total_pages = len(pdf_document.pages) + try: + # 尝试使用 PyPDF2 读取 PDF + try: + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + is_pypdf2 = True + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"extract_common_header:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF + pdf_document = fitz.open(pdf_path) + total_pages = pdf_document.page_count + is_pypdf2 = False + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"extract_common_header:使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return "" # 或者根据需求抛出异常 - # 定义两个提取策略 - strategies = [] - if total_pages >= 3: - # 策略1:中间的3页 - middle_page = total_pages // 2 - start_page = max(0, middle_page - 1) - strategies.append((start_page, 3)) - elif total_pages == 2: - # 策略1:2页 - strategies.append((0, 2)) - else: - # 策略1:1页 - strategies.append((0, 1)) + # 定义两个提取策略 + strategies = [] + if total_pages >= 3: + # 策略1:中间的3页 + middle_page = total_pages // 2 + start_page = max(0, middle_page - 1) + strategies.append((start_page, 3)) + elif total_pages == 2: + # 策略1:2页 + strategies.append((0, 2)) + else: + # 策略1:1页 + strategies.append((0, 1)) - # 策略2:前三页 - if total_pages >= 3: - strategies.append((0, 3)) - elif total_pages == 2: - 
strategies.append((0, 2)) - elif total_pages == 1: - strategies.append((0, 1)) + # 策略2:前三页 + if total_pages >= 3: + strategies.append((0, 3)) + elif total_pages == 2: + strategies.append((0, 2)) + elif total_pages == 1: + strategies.append((0, 1)) - common_headers = [] + common_headers = [] - for idx, (start, count) in enumerate(strategies): - headers = get_headers(pdf_document, start, count) - if len(headers) < 2: - continue # 需要至少2页来比较 + for idx, (start, count) in enumerate(strategies): + headers = get_headers(pdf_document, start, count, is_pypdf2) + if len(headers) < 2: + continue # 需要至少2页来比较 - current_common = find_common_headers(headers) - if current_common: - common_headers = current_common - break # 找到共同部分后退出 - # 如果没有找到,继续下一个策略 + current_common = find_common_headers(headers) + if current_common: + common_headers = current_common + print(f"使用策略{idx + 1}找到共同的页眉: {common_headers}") + break # 找到共同部分后退出 + # 如果没有找到,继续下一个策略 - return '\n'.join(common_headers) + return '\n'.join(common_headers) + + except Exception as e: + print(f"Error in extract_common_header: {e}") + return "" # 根据需求调整返回值 def clean_page_content(text, common_header): # 首先删除抬头公共部分 diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index 3bb086c..e2016b6 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -119,10 +119,10 @@ if __name__ == "__main__": # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\file1736998876340 (1).doc" + pdf_path=r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719\ztbfile.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" - output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" + output_folder = r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719" # selections = [1, 4] # 仅处理 selection 4、1 # selections = [1, 2, 3, 5] # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 361817f..294cdaa 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -1,6 +1,8 @@ # flask_app/general/截取pdf通用函数.py import concurrent.futures import os + +import fitz import regex from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import extract_common_header, clean_page_content @@ -8,36 +10,94 @@ from flask_app.general.format_change import docx2pdf import concurrent.futures +# 定义一个函数用于提取文本 +def get_text_pypdf2(pdf, page_num): + return pdf.pages[page_num].extract_text() or "" + +def get_text_fitz(pdf, page_num): + return pdf.load_page(page_num).get_text() or "" +# 定义一个内部函数来绑定 pdf_document 对象 +def create_get_text_function(pdf_lib, pdf_doc): + if pdf_lib == 'pypdf2': + return lambda page_num: get_text_pypdf2(pdf_doc, page_num) + elif pdf_lib == 'fitz': + return lambda page_num: get_text_fitz(pdf_doc, page_num) + else: + raise ValueError("不支持的 PDF 库。") + def get_start_and_common_header(input_path, end_page): - common_header = extract_common_header(input_path) - last_begin_index = 0 - begin_pattern = regex.compile( - r'.*(? 
end_page: - return common_header, 0 # 如果页码大于end_page,直接返回last_begin_index=0 - text = page.extract_text() - if text: - cleaned_text = clean_page_content(text, common_header) - # 检查是否存在"目录" - if catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - if begin_pattern.search(cleaned_text): - last_begin_index = i # 更新第一个匹配的索引,页码从0开始 - return common_header, last_begin_index - return common_header, last_begin_index + """ + 提取 PDF 文件的公共页眉和起始页索引。 + + 参数: + - input_path: PDF 文件路径。 + - end_page: 遍历的结束页码。 + + 返回: + - common_header: 公共页眉内容。 + - last_begin_index: 起始页的页码索引。 + """ + try: + # 提取公共页眉 + common_header = extract_common_header(input_path) # 假设该函数已定义 + + last_begin_index = 0 + begin_pattern = regex.compile( + r'.*(? end_page: + return common_header, 0 # 如果页码大于 end_page,直接返回 last_begin_index=0 + text = get_text(i) + if text: + cleaned_text = clean_page_content(text, common_header) + # 检查是否存在"目录" + if catalog_pattern.search(cleaned_text): + print(f"页面 {i} 包含目录,跳过。") + continue # 如果存在目录,跳过当前页面 + if begin_pattern.search(cleaned_text): + last_begin_index = i # 更新第一个匹配的索引,页码从0开始 + print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}") + return common_header, last_begin_index + return common_header, last_begin_index + except Exception as e: + print(f"Error in get_start_and_common_header: {e}") + return "", 0 # 根据需求调整返回值 def check_pdf_pages(pdf_path, mode, logger): try: - reader = PdfReader(pdf_path) - num_pages = len(reader.pages) - logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") + try: + pdf_document = PdfReader(pdf_path) + num_pages = len(pdf_document.pages) + # logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}") + except Exception as e_pypdf2: + print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + pdf_document = fitz.open(pdf_path) + num_pages = pdf_document.page_count if num_pages <= 30: logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。") if mode == 'goods': @@ -57,50 +117,120 @@ def check_pdf_pages(pdf_path, mode, logger): def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header): try: - # 检查 start_page 和 end_page 是否为 None if start_page is None or end_page is None: - print("Error: start_page 或 end_page 为 None") + print("错误: start_page 或 end_page 为 None") return "" - pdf_document = PdfReader(pdf_path) - total_pages = len(pdf_document.pages) - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + # 尝试使用 PyPDF2 读取 PDF + try: + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + get_text = lambda p: pdf_document.pages[p].extract_text() + using_pypdf2 = True + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"save_extracted_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + # 尝试使用 PyMuPDF 读取 PDF + try: + pdf_document = fitz.open(pdf_path) + total_pages = pdf_document.page_count + get_text = lambda p: pdf_document.load_page(p).get_text() + using_pypdf2 = False + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return "" + + # 检查页面范围的有效性 if start_page < 0 or end_page >= total_pages or start_page > end_page: print(f"无效的页面范围: {start_page} 到 {end_page}") return "" + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, 
f"{base_file_name}_before.pdf") - before_doc = PdfWriter() toc_page = -1 - # 查找目录页 + for page_num in range(min(start_page, total_pages)): - page_text = pdf_document.pages[page_num].extract_text() - cleaned_text = clean_page_content(page_text, common_header) + try: + text = get_text(page_num) + except Exception as e_extract: + print(f"提取第 {page_num} 页文本时出错: {e_extract}") + continue + cleaned_text = clean_page_content(text, common_header) if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): toc_page = page_num break - # 确定截取的页数 pages_to_extract = toc_page + 1 if toc_page != -1 else start_page - # 提取页面 - for page_num in range(pages_to_extract): - before_doc.add_page(pdf_document.pages[page_num]) - print(before_pdf_path) - with open(before_pdf_path, 'wb') as f_before: - before_doc.write(f_before) - output_doc = PdfWriter() - for page_num in range(start_page, end_page + 1): - output_doc.add_page(pdf_document.pages[page_num]) - with open(output_pdf_path, 'wb') as f: - output_doc.write(f) - print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") - return output_pdf_path + if using_pypdf2: + before_doc = PdfWriter() + for page_num in range(pages_to_extract): + try: + before_doc.add_page(pdf_document.pages[page_num]) + except Exception as e_page: + print(f"添加第 {page_num} 页到 before_doc 时出错: {e_page}") + continue + try: + with open(before_pdf_path, 'wb') as f_before: + before_doc.write(f_before) + print(f"目录前的页面已保存到 {before_pdf_path}") + except Exception as e_write_before: + print(f"保存目录前的页面时出错: {e_write_before}") + else: + before_doc = fitz.open() + for page_num in range(pages_to_extract): + try: + before_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) + except Exception as e_page: + print(f"添加第 {page_num} 页到 before_doc 时出错: {e_page}") + continue + try: + before_doc.save(before_pdf_path) + before_doc.close() + print(f"目录前的页面已保存到 {before_pdf_path}") + except Exception as e_write_before: + print(f"保存目录前的页面时出错: {e_write_before}") + + # 提取并保存目标页面 + if using_pypdf2: + output_doc = PdfWriter() + for page_num in range(start_page, end_page + 1): + try: + output_doc.add_page(pdf_document.pages[page_num]) + except Exception as e_page: + print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") + continue + try: + with open(output_pdf_path, 'wb') as f: + output_doc.write(f) + print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + return output_pdf_path + except Exception as e_write: + print(f"保存截取的页面时出错: {e_write}") + return "" + else: + output_doc = fitz.open() + for page_num in range(start_page, end_page + 1): + try: + output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) + except Exception as e_page: + print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}") + continue + try: + output_doc.save(output_pdf_path) + output_doc.close() + print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + return output_pdf_path + except Exception as e_write: + print(f"保存截取的页面时出错: {e_write}") + return "" except Exception as e: - print(f"Error in save_extracted_pages: {e}") - return "" # 返回空字符串 + print(f"发生未知错误: {e}") + return "" def is_pdf_or_doc(filename): @@ -159,17 +289,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): try: # 打开PDF文件 - pdf_document = PdfReader(file_path) - total_pages = len(pdf_document.pages) - # 新增逻辑:如果总页数 <= 50,直接返回 file_path + try: + pdf_document = PdfReader(file_path) + pdf_lib = 'pypdf2' + # 获取总页数(对于 pypdf2 的 PDFDocument) + total_pages = 
len(pdf_document.pages) + except Exception as e: + print(f"get_invalid_file:使用 pypdf2 读取失败,错误信息: {e},切换至 fitz。") + pdf_document = fitz.open(file_path) + pdf_lib = 'fitz' + # 获取总页数(对于 fitz 的 PDFDocument) + total_pages = pdf_document.page_count if total_pages <= 50: print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}") return file_path, 0 + get_text = create_get_text_function(pdf_lib, pdf_document) # 提取并清理每页的文本内容 page_texts = [] for i in range(total_pages): - page = pdf_document.pages[i] - text = page.extract_text() + text = get_text(i) cleaned_text = clean_page_content(text, common_header) if text else "" page_texts.append(cleaned_text) @@ -254,44 +392,65 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): return "", 0 -def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, +def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, output_suffix="normal", invalid_endpage=-1): start_page = None end_page = None flag = True - # 确定遍历范围 - total_pages = len(pdf_document.pages) + + try: + # 尝试使用 PyPDF2 读取 PDF + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + get_text = create_get_text_function('pypdf2', pdf_document) + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF + pdf_document = fitz.open(pdf_path) + total_pages = pdf_document.page_count + get_text = create_get_text_function('fitz', pdf_document) + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return None, None # 或者根据需求抛出异常 end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages) - for i, page in enumerate(pdf_document.pages[begin_page:end_limit], start=begin_page): - text = page.extract_text() or "" - cleaned_text = clean_page_content(text, common_header) - if output_suffix == "tobidders_notice2": #extract_pages_twice_tobidders_notice会调用该代码 - catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) - if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern,cleaned_text): - flag = False - continue - if begin_page == 0 and catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - else: - # 一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern - if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text): - flag = False - continue - if start_page is None and regex.search(begin_pattern, cleaned_text): - if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): - start_page = i - continue - if start_page is not None: - if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]: # 使用列表判断多个匹配项 - # 判断 end_pattern 是否匹配且当前页大于起始页 - if regex.search(end_pattern, cleaned_text) and i > start_page: - end_page = i - break + for i in range(begin_page, end_limit): + try: + text = get_text(i) + except Exception as e_extract: + print(f"提取第 {i} 页文本时出错: {e_extract}") + continue # 跳过出错的页面 + if text: + cleaned_text = clean_page_content(text, common_header) + if output_suffix == "tobidders_notice2": #extract_pages_twice_tobidders_notice会调用该代码 + catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) + if exclusion_pattern and flag and (start_page is not None) and 
regex.search(exclusion_pattern,cleaned_text): + flag = False + continue + if begin_page == 0 and catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 else: - # 如果 end_pattern 匹配且 begin_pattern 不匹配 - if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text): - end_page = i - break + # 一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern + if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text): + flag = False + continue + if start_page is None and regex.search(begin_pattern, cleaned_text): + if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): + start_page = i + continue + if start_page is not None: + if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]: # 使用列表判断多个匹配项 + # 判断 end_pattern 是否匹配且当前页大于起始页 + if regex.search(end_pattern, cleaned_text) and i > start_page: + end_page = i + break + else: + # 如果 end_pattern 匹配且 begin_pattern 不匹配 + if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text): + end_page = i + break return start_page, end_page @@ -309,7 +468,7 @@ def generate_mid_pattern(chapter_type=None): # 定义基础的 mid_pattern base_mid_pattern = ( r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' - r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)|' + r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|' r'(?start_page>begin_page - start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header, + start_page2, end_page2 = extract_pages_generic(pdf_path, second_begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) if start_page2 is None or end_page2 is None: @@ -563,8 +740,7 @@ def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpag exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) output_suffix = 'notice' - pdf_document = PdfReader(pdf_path) - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, + start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix, invalid_endpage) if start_page is None or end_page is None: print(f"{output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index 07b478f..e1e82b1 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -338,7 +338,32 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection): return parsed except Exception as e: return {"error": "调用大模型失败"} +def check_consecutive_chinese_numerals(data_dict): + """ + 检查字典的键名中是否存在连续三个中文数字键名。此时通常视为clause1提取失败,直接调用大模型更可靠 + 返回: + bool: 如果存在连续三个中文数字键名,返回 False;否则返回 True。 + """ + # 定义中文数字集合 + chinese_numerals = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "十一", "十二", "十三"} + # 获取字典的键列表,保持插入顺序 + keys = list(data_dict.keys()) + + # 初始化计数器 + count = 0 + + # 遍历所有键名 + for key in keys: + if key in chinese_numerals: + count += 1 + if count >= 3: + print("有三个及以上连续中文数字!") + return False + else: + count = 0 # 重置计数器 + + return True # 没有找到连续三个中文数字键名 # 读取JSON数据,提取内容,转换结构,并打印结果 def extract_from_notice(merged_baseinfo_path, clause_path, type): """ @@ -377,7 +402,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type): if clause_path and 
clause_path.strip(): with open(clause_path, 'r', encoding='utf-8') as file: data = json.load(file) - if len(data) >= 60: #默认clause中少于60条视为json提取失败! + if len(data) >= 60 and check_consecutive_chinese_numerals(data): #默认clause中少于60条视为json提取失败! # 尝试使用大章节筛选 extracted_data = extract_between_sections(data, target_values,flag) if extracted_data: @@ -406,7 +431,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type): if __name__ == "__main__": # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf" - clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json" + clause_path=r"D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\clause1.json" try: res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 res2 = json.dumps(res, ensure_ascii=False, indent=4) diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index df241a6..f659e81 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -481,11 +481,11 @@ def convert_clause_to_json(file_path, output_folder, type=1): if __name__ == "__main__": - file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf' + file_path = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\ztbfile_tobidders_notice_part2.pdf' # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf' # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf' - output_folder = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444' + output_folder = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp' try: output_path = convert_clause_to_json(file_path, output_folder) print(f"Final JSON result saved to: {output_path}") diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index b6e3225..4bac2cb 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -117,7 +117,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path): if __name__ == '__main__': # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' - pdf_path=r"D:\flask_project\flask_app\static\output\output1\032c44dc-230c-4757-9160-c99f2eaac790\ztbfile_invalid.pdf" + pdf_path=r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719\ztbfile.pdf" # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index 08a3598..9a16392 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -1,4 +1,5 @@ # flask_app/工程标/截取pdf工程标版.py +import fitz import regex import os import time @@ -11,10 +12,9 @@ from flask_app.general.通用功能函数 import 
get_global_logger def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,invalid_endpage=-1): try: - pdf_document = PdfReader(pdf_path) exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, + start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix, invalid_endpage) if start_page is not None and end_page is not None: return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header) @@ -30,7 +30,21 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l 此函数将在所有模式对都失败后调用。 """ try: - pdf_document = PdfReader(pdf_path) + try: + pdf_document = PdfReader(pdf_path) + is_pypdf2 = True + total_pages = len(pdf_document.pages) + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"extract_pages_twice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + pdf_document = fitz.open(pdf_path) + is_pypdf2 = False + total_pages = pdf_document.page_count + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return "" if output_suffix == "qualification": print("最后一次尝试获取qualification!") # 动态设置 include_keys @@ -49,9 +63,18 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l ) start_page = None end_page = None + def get_text(page_num): + try: + if is_pypdf2: + return pdf_document.pages[page_num].extract_text() or "" + else: + return pdf_document.load_page(page_num).get_text() or "" + except Exception as e: + print(f"提取第 {page_num} 页文本时出错: {e}") + return "" # 从指定的开始页开始遍历 - for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index): - text = page.extract_text() + for i in range(last_begin_index, total_pages): + text = get_text(i) if text: cleaned_text = clean_page_content(text, common_header) # 确定起始页,需在last_begin_index之后 diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 4286f52..4d81592 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -1,18 +1,17 @@ +import fitz from PyPDF2 import PdfReader, PdfWriter import regex # 导入正则表达式库 import os # 用于文件和文件夹操作 -from flask_app.general.clean_pdf import clean_page_content, extract_common_header +from flask_app.general.clean_pdf import clean_page_content from flask_app.general.merge_pdfs import merge_and_cleanup -from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \ - convert_to_pdf, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice +from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice from flask_app.general.通用功能函数 import get_global_logger def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,logger,invalid_endpage=-1): try: - pdf_document = PdfReader(pdf_path) exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, + start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, 
exclusion_pattern, output_suffix,invalid_endpage) if start_page is None or end_page is None: print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") @@ -64,7 +63,8 @@ def get_patterns_for_notice(): ) return begin_pattern, end_pattern -def extract_pages_qualification(pdf_document, begin_page, common_header): +def extract_pages_qualification(pdf_path, begin_page, common_header): + pdf_document = PdfReader(pdf_path) # 开始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = regex.compile( r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)(?!的)', @@ -73,7 +73,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): # 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头 priority_pattern = regex.compile( - r'^((资格|符合|资格性|符合性)(?:、(资格|符合|资格性|符合性))*)?(检查|审查)(?:表|比对表|对照表)?$', + r'^((资格|符合|资格性|符合性)(?:、(资格|符合|资格性|符合性))*)?(检查|审查)(?:表|比对表|对照表)?\s*$', regex.MULTILINE ) @@ -94,12 +94,32 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"] exclude_keywords = ["声明函", "承诺函"] + try: + # 尝试使用 PyPDF2 读取 PDF + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + get_text = lambda page: page.extract_text() + # print("使用 PyPDF2 成功读取 PDF 文件。") + except Exception as e_pypdf2: + print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") + try: + # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF + pdf_document = fitz.open(pdf_path) + total_pages = pdf_document.page_count + get_text = lambda page: pdf_document.load_page(page).get_text() + # print("使用 PyMuPDF 成功读取 PDF 文件。") + except Exception as e_fitz: + print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}") + return None, None # 或者根据需求抛出异常 # 从指定的开始页开始遍历 - for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): - text = page.extract_text() + for i in range(begin_page, total_pages): + try: + text = get_text(i) + except Exception as e_extract: + print(f"提取第 {i} 页文本时出错: {e_extract}") + continue # 跳过出错的页面 if text: cleaned_text = clean_page_content(text, common_header) - # 优先检查是否匹配优先模式 priority_match = priority_pattern.search(cleaned_text) if priority_match and start_page is None: @@ -142,7 +162,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b try: exclusion_pattern = regex.compile( r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) - pdf_document = PdfReader(pdf_path) patterns = None start_page = None end_page = None @@ -154,10 +173,10 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b elif output_suffix == "notice": patterns = [get_patterns_for_notice()] elif output_suffix == "qualification1": - start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header) + start_page, end_page = extract_pages_qualification(pdf_path, begin_page, common_header) if patterns: for pattern_pair in patterns: - start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, + start_page, end_page = extract_pages_generic(pdf_path, pattern_pair[0], pattern_pair[1], begin_page, common_header, exclusion_pattern, output_suffix) if start_page is not None and end_page is not None: break @@ -315,11 +334,11 @@ if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf" + 
pdf_path=r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719\ztbfile.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444" + output_folder = r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path + selection = 3 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) print(generated_files) From c7ba7a5c939e10a4949f0b330b20bc83dbc10583 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 21 Jan 2025 15:53:31 +0800 Subject: [PATCH 2/2] =?UTF-8?q?1.20=20pypdf2=E8=AF=BB=E5=8F=96=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E5=8F=AF=E8=BD=AC=E4=B8=BA=E4=BD=BF=E7=94=A8fitz?= =?UTF-8?q?=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/clean_pdf.py | 2 +- flask_app/general/merge_pdfs.py | 85 +++++-- flask_app/general/截取pdf_main.py | 4 +- flask_app/general/截取pdf通用函数.py | 26 +- flask_app/general/读取文件/按页读取pdf.py | 2 +- flask_app/工程标/截取pdf工程标版.py | 2 +- flask_app/货物标/截取pdf货物标版.py | 21 +- flask_app/货物标/技术参数要求提取.py | 278 +++++++++++----------- 8 files changed, 236 insertions(+), 184 deletions(-) diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index f6c1b2f..25beba0 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -121,7 +121,7 @@ def extract_common_header(pdf_path): current_common = find_common_headers(headers) if current_common: common_headers = current_common - print(f"使用策略{idx + 1}找到共同的页眉: {common_headers}") + # print(f"使用策略{idx + 1}找到共同的页眉: {common_headers}") break # 找到共同部分后退出 # 如果没有找到,继续下一个策略 diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index fe60fcb..f37aa4e 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -1,53 +1,106 @@ import os +import fitz from PyPDF2 import PdfReader, PdfWriter + +from flask_app.general.截取pdf通用函数 import create_get_text_function + + #合并PDF +# 主合并函数,尝试使用 PyPDF2,若失败则使用 fitz def merge_pdfs(paths, output_path): - # 检查是否所有路径都是空字符串或仅包含空白字符 + # 检查所有路径是否为空或仅包含空白字符 if not any(path.strip() for path in paths): return "" + # 优先尝试 PyPDF2 合并 + try: + return merge_with_pypdf2(paths, output_path) + except Exception as e: + print(f"使用 PyPDF2 合并失败,尝试使用 fitz。错误: {e}") + # 若失败则尝试 fitz + try: + return merge_with_fitz(paths, output_path) + except Exception as ex: + print(f"使用 fitz 合并也失败了。错误: {ex}") + return "" + +def merge_with_pypdf2(paths, output_path): pdf_writer = PdfWriter() - last_page_text = None # 用于存储上一个PDF的最后一页的文本 + last_page_text = None # 存储上一个 PDF 的最后一页文本 for path in paths: - # 跳过空字符串或仅包含空白字符的路径 if not path.strip(): continue try: pdf_reader = PdfReader(path) pages = pdf_reader.pages - start_index = 0 # 从第一页开始添加 + get_text = create_get_text_function('pypdf2', pdf_reader) - # 如果这不是第一个文件,并且有上一个文件的最后一页文本 + start_index = 0 if last_page_text is not None and len(pages) > 0: - current_first_page_text = pages[0].extract_text() or "" - # 
比较当前文件的第一页和上一个文件的最后一页的文本 + current_first_page_text = get_text(0) if current_first_page_text == last_page_text: - start_index = 1 # 如果相同,跳过当前文件的第一页 + start_index = 1 - # 添加当前PDF的页面到写入器 for page_num in range(start_index, len(pages)): pdf_writer.add_page(pages[page_num]) - # 更新 last_page_text 为当前PDF的最后一页的文本 if len(pages) > 0: - last_page_text = pages[-1].extract_text() or "" + last_page_text = get_text(len(pages) - 1) except Exception as e: - print(f"文件 '{path}' 无法处理,错误: {e}") + print(f"文件 '{path}' 无法使用 PyPDF2 处理,错误: {e}") continue - # 如果没有添加任何页面,返回空字符串 if len(pdf_writer.pages) == 0: return "" - # 写入合并后的PDF到文件 + with open(output_path, 'wb') as out: + pdf_writer.write(out) + return output_path + +def merge_with_fitz(paths, output_path): + # 使用 fitz 创建新文档 + merged_pdf = fitz.open() + last_page_text = None + + for path in paths: + if not path.strip(): + continue + try: + pdf_doc = fitz.open(path) + get_text = create_get_text_function('fitz', pdf_doc) + page_count = pdf_doc.page_count + + start_index = 0 + if last_page_text is not None and page_count > 0: + current_first_page_text = get_text(0) + if current_first_page_text == last_page_text: + start_index = 1 + + # 插入页面到新文档 + for page_num in range(start_index, page_count): + merged_pdf.insert_pdf(pdf_doc, from_page=page_num, to_page=page_num) + + if page_count > 0: + last_page_text = get_text(page_count - 1) + + pdf_doc.close() + except Exception as e: + print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}") + continue + + if merged_pdf.page_count == 0: + merged_pdf.close() + return "" + try: - with open(output_path, 'wb') as out: - pdf_writer.write(out) + merged_pdf.save(output_path) + merged_pdf.close() return output_path except Exception as e: print(f"无法写入输出文件,错误: {e}") + merged_pdf.close() return "" def judge_file_exist(original_path, new_suffix): diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index e2016b6..332d5d2 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -119,10 +119,10 @@ if __name__ == "__main__": # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" - pdf_path=r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719\ztbfile.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" - output_folder = r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719" + output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" # selections = [1, 4] # 仅处理 selection 4、1 # selections = [1, 2, 3, 5] # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 294cdaa..d81ab00 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -76,7 +76,7 @@ def get_start_and_common_header(input_path, end_page): cleaned_text = clean_page_content(text, common_header) # 检查是否存在"目录" if catalog_pattern.search(cleaned_text): - print(f"页面 {i} 包含目录,跳过。") + # print(f"页面 {i} 包含目录,跳过。") continue # 如果存在目录,跳过当前页面 if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 @@ -278,7 +278,7 @@ def 
get_invalid_file(file_path, output_folder, common_header, begin_page): regex.MULTILINE ), regex.compile( - r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*", + r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$", regex.MULTILINE ), ] @@ -332,7 +332,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): for i in range(30, total_pages): text = page_texts[i] exclusion_pattern = regex.compile( - r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE) + r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE) if regex.search(exclusion_pattern,text): continue if regex.search(pattern, text): @@ -348,8 +348,11 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): return i # 如果没有匹配到,设置end_page逻辑 - if total_pages > 100: - print("未找到结束模式,总页数大于100,设置结束页为前100页。" + file_path) + if total_pages > 200: + print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {end_page}。" + file_path) + return max(100, int(total_pages * 1 / 2)) + elif 100 < total_pages <= 200: + print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。" + file_path) return max(100, int(total_pages * 2 / 3)) else: print("未找到结束模式,设置结束页为最后一页。" + file_path) @@ -417,11 +420,7 @@ def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, comm return None, None # 或者根据需求抛出异常 end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages) for i in range(begin_page, end_limit): - try: - text = get_text(i) - except Exception as e_extract: - print(f"提取第 {i} 页文本时出错: {e_extract}") - continue # 跳过出错的页面 + text = get_text(i) if text: cleaned_text = clean_page_content(text, common_header) if output_suffix == "tobidders_notice2": #extract_pages_twice_tobidders_notice会调用该代码 @@ -641,7 +640,8 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h # 第一次提取尝试,使用初始的 begin_pattern begin_pattern = regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|' - r'(?max_tokens: - questions_to_continue.append((question, message)) - else: - temp_final.update(parsed) - # 第二步:多线程处理需要调用 `continue_answer` 的问题 - if questions_to_continue: - continued_results = process_continue_answers(questions_to_continue, model_type, file_id) - temp_final.update(continued_results) - - """根据所有键是否已添加处理技术要求""" - # 更新原始采购需求字典 - final_res=combine_and_update_results(modified_data, temp_final) - ffinal_res=all_postprocess(final_res) - ffinal_res["货物列表"] = good_list - # 输出最终的 JSON 字符串 - return {"采购需求":ffinal_res} +# cleaned_res = clean_json_string(model_res) #转字典 +# if not cleaned_res: +# print("本项目没有采购需求!!!") +# return {"采购需求": {}} +# preprocessed_data=preprocess_data(cleaned_res) #确保最内层为[] +# processed_data=truncate_system_keys(preprocessed_data) #限制深度 +# key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'} +# modified_data=rename_keys(data_copy) +# user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。 +# **重要限制**: +# - **仅提取技术参数或采购要求,不包括任何商务要求**。商务要求通常涉及供应商资格、报价条款、交货时间、质保等内容,是整体的要求;而技术参数或采购要求则具体描述产品的技术规格、功能、性能指标等。 +# - **商务要求的关键词示例**(仅供参考,不限于此):报价、交货、合同、资质、认证、服务、保修期等。如果内容包含上述关键词,请仔细甄别是否属于商务要求。 +# +# 要求与指南: +# 1. 
你的键值应该全面,不要遗漏。 +# -a.若技术参数或采购要求在表格中,那么单元格内的内容基本都要涵盖 +# -对于单元格内以序号分隔的各条参数要求,应逐条提取,并分别作为键值中的字符串列表项。 +# -对于无序号标明且在同一单元格内的参数要求或功能说明,也要根据语义分别添加进键值中。 +# -b.若技术参数或采购要求在正文部分,应准确定位到与目标货物(设备、系统、功能模块)相关的内容,将其后的技术参数或采购要求或功能说明完整提取,逐一添加到键值的字符串列表中,不得擅自添加或修改序号。 +# 2. 如果存在嵌套结构,且原文为Markdown 的表格语法,如'摄像机|有效像素|≥900W像素', 请不要返回该Markdown语法,而是使用冒号':'将相关信息拼接在一起,生成一条完整且清晰的技术参数(或采购要求)描述,作为列表中的一个字符串。如"摄像机:有效像素:≥900W像素"。 +# 3. 字符串中的内容为具体的技术参数要求或采购要求,请不要返回诸如'(1)高清录像功能'这种标题性质且不能体现要求的内容。 +# 4. 如果该货物没有相关采购要求或技术参数要求,键值应为空列表[]。 +# +# ### 示例输出1如下: +# {{ +# "摄像机控制键盘": [ +# "1、▲支持串行 RS232/RS422 和 IP 混合控制,允许在一个控制器上使用 RS232/RS422/IP 控制单个系统中的摄像机;", +# "2、支持 2 组 RS422 串口 VISCA 协议菊花链控制 2x7 台摄像机。", +# "★能够自动对焦,提供检测报告" +# ] +# }} +# +# ### 示例输出2如下(包含嵌套结构): +# {{ +# "摄像机": [ +# "摄像机:有效像素:≥900W像素", +# "摄像机:最低照度:彩色≤0.001lx", +# "协议:routes 接口开放:具备;▲支持标准 ONVIF 协议与第三方厂家设备进行互联;支持 GB/T28181;应提供 SDK" +# ] +# }} +# +# {} +# """ +# user_query_template_two="""请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为\"{}-1\";键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。 +# +# 要求与指南: +# 1. 你的键值应该全面,不要遗漏。 +# -a.若技术参数或采购要求在表格中,那么单元格内的内容基本都要涵盖 +# -对于单元格内以序号分隔的各条参数要求,应逐条提取,并分别作为键值中的字符串列表项。 +# -对于无序号标明且在同一单元格内的参数要求或功能说明,也要根据语义分别添加进键值中。 +# -b.若技术参数或采购要求在正文部分,应准确定位到与目标货物(设备、系统、功能模块)相关的内容,将其后的技术参数或采购要求或功能说明完整提取,逐一添加到键值的字符串列表中,不得擅自添加或修改序号。 +# 2. 如果存在嵌套结构,且原文为Markdown 的表格语法,如'摄像机|有效像素|≥900W像素', 请不要返回该Markdown语法,而是使用冒号':'将相关信息拼接在一起,生成一条完整且清晰的技术参数(或采购要求)描述,作为列表中的一个字符串。如"摄像机:有效像素:≥900W像素"。 +# 3. 字符串中的内容为具体的技术参数要求或采购要求,请不要返回诸如'(1)高清录像功能'这种标题性质且不能体现要求的内容。 +# 4. 如果该货物没有相关采购要求或技术参数要求,键值应为空列表[]。 +# +# ### 示例输出1如下: +# {{ +# "交换机-1": [ +# "★1、支持固化千兆电口≥8 个,固化千兆光口≥2 个,桌面型设备;", +# "2、支持静态链路聚合" +# ], +# "交换机-2": [ +# "1、交换容量≥52Gbps,包转发率≥38.69Mpps,", +# "2、提供国家强制性产品认证证书及测试报告(3C)", +# "★能实现信号控制独立传输" +# ] +# }} +# +# ### 示例输出2如下(包含嵌套结构): +# {{ +# "摄像机-1": [ +# "摄像机:有效像素:≥900W像素", +# "摄像机:最低照度:彩色≤0.001lx", +# "协议:routes 接口开放:具备;▲支持标准 ONVIF 协议与第三方厂家设备进行互联;支持 GB/T28181;应提供 SDK" +# ], +# "摄像机-2": [ +# "支持夜视", "支持云存储" +# ] +# }} +# +# {} +# """ +# queries = [] +# for key in key_paths: +# # 将键中的 '.' 替换为 '下的' +# modified_key = key.replace('.', '下的') +# # 使用修改后的键填充第一个占位符,原始键填充第二个占位符 +# if model_type==1: +# new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容:{full_text}") #转豆包后取消注释 +# else: +# new_query = user_query_template.format(modified_key, key, modified_key,"") +# queries.append(new_query) +# +# # 处理 grouped_paths 中的项,应用 user_query_template_two +# for grouped_dict in grouped_paths: +# for grouped_key, grouped_key_cnt in grouped_dict.items(): +# # 将键中的 '.' 
替换为 '下的' +# modified_grouped_key = grouped_key.replace('.', '下的') +# if model_type==1: +# new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key, +# modified_grouped_key, f"文件内容:{full_text}") +# else: +# new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key, +# modified_grouped_key, "") +# queries.append(new_query) +# if model_type==1: +# results = multi_threading(queries, "", "", 3,True) # 豆包 +# else: +# results = multi_threading(queries, "", file_id, 2,True) # qianwen-long +# temp_final={} +# if not results: +# print("errror!未获得大模型的回答!") +# else: +# # 第一步:收集需要调用 `continue_answer` 的问题和解析结果 +# questions_to_continue = [] # 存储需要调用 continue_answer 的 (question, parsed) +# max_tokens=3900 if model_type==1 else 5900 +# for question, response in results: +# message=response[0] +# parsed = clean_json_string(message) +# total_tokens=response[1] +# if not parsed and total_tokens>max_tokens: +# questions_to_continue.append((question, message)) +# else: +# temp_final.update(parsed) +# # 第二步:多线程处理需要调用 `continue_answer` 的问题 +# if questions_to_continue: +# continued_results = process_continue_answers(questions_to_continue, model_type, file_id) +# temp_final.update(continued_results) +# +# """根据所有键是否已添加处理技术要求""" +# # 更新原始采购需求字典 +# final_res=combine_and_update_results(modified_data, temp_final) +# ffinal_res=all_postprocess(final_res) +# ffinal_res["货物列表"] = good_list +# # 输出最终的 JSON 字符串 +# return {"采购需求":ffinal_res} def test_all_files_in_folder(input_folder, output_folder): # 确保输出文件夹存在 @@ -669,11 +669,11 @@ if __name__ == "__main__": start_time=time.time() # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf" # file_id = upload_file(truncate_file) - truncate_file=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf' + truncate_file=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目_procurement.pdf' invalid_path=r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\invalid_del.docx" # file_id=upload_file(truncate_file) - # processed_filepath = convert_file_to_markdown(truncate_file) - processed_filepath=r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\extract1.txt" + processed_filepath = convert_file_to_markdown(truncate_file) + # processed_filepath=r"D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\extract1.txt" res=get_technical_requirements(invalid_path,processed_filepath,1) json_string = json.dumps(res, ensure_ascii=False, indent=4) print(json_string)
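
For reference, a minimal self-contained sketch of the PyPDF2-to-fitz fallback that both patches apply throughout (mirroring get_text_pypdf2 / get_text_fitz / create_get_text_function added in flask_app/general/截取pdf通用函数.py): open the PDF with PyPDF2 first and, if that raises, reopen it with PyMuPDF, exposing a uniform get_text(page_num) callable either way. The file path in the usage lines is a hypothetical placeholder, not a path from the project.

    import fitz
    from PyPDF2 import PdfReader

    def open_pdf_with_fallback(pdf_path):
        # Try PyPDF2 first; on any failure, reopen the same file with PyMuPDF (fitz).
        try:
            doc = PdfReader(pdf_path)
            total_pages = len(doc.pages)
            get_text = lambda n: doc.pages[n].extract_text() or ""
        except Exception as e:
            print(f"PyPDF2 failed ({e}), falling back to fitz.")
            doc = fitz.open(pdf_path)
            total_pages = doc.page_count
            get_text = lambda n: doc.load_page(n).get_text() or ""
        return doc, total_pages, get_text

    if __name__ == "__main__":
        # Hypothetical sample path, for illustration only.
        doc, total_pages, get_text = open_pdf_with_fallback("sample.pdf")
        print(total_pages, get_text(0)[:80])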