diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index b58b21b..29ec264 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -259,6 +259,246 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page): print(f"处理文件 {file_path} 时发生错误: {e}") return "" +def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, + output_suffix="normal"): + start_page = None + end_page = None + flag=True + for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): + text = page.extract_text() or "" + cleaned_text = clean_page_content(text, common_header) + if output_suffix == "tobidders_notice": + catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) + if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text): + flag=False + continue + if begin_page==0 and catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 + else: + #一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern + if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text): + flag=False + continue + if start_page is None and regex.search(begin_pattern, cleaned_text): + if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): + start_page = i + continue + if start_page is not None: + if output_suffix == "tobidders_notice": + if regex.search(end_pattern, cleaned_text) and i > start_page: + end_page = i + break + else: + if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text): + end_page = i + break + return start_page, end_page + +def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): + """ + 从PDF文档中提取起始页、中间页和结束页。 + + 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。 + + 参数: + pdf_document (PDFDocument): 要处理的PDF文档对象。 + begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 + begin_page (int): 开始搜索的页码。 + common_header (str): 每页需要清理的公共头部文本。 + exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。 + + 返回: + tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None) + """ + + def run_extraction(local_begin_pattern, local_end_pattern=None): + """ + 使用提供的 begin 和 end 模式运行提取过程。 + + 如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。 + + 参数: + local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 + local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。 + + 返回: + tuple: (start_page, mid_page, end_page) + """ + start_page = None + mid_page = None + end_page = None + combined_mid_pattern = None # 中间页的组合模式 + pdf_document = PdfReader(pdf_path) + catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) + for i, page in enumerate(pdf_document.pages): + text = page.extract_text() or "" + cleaned_text = clean_page_content(text, common_header) + + # 如果已经找到中间页,且当前页匹配排除模式,则跳过 + if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None: + continue + if begin_page==0 and catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 + + # 识别起始页 + if start_page is None: + match = regex.search(local_begin_pattern, cleaned_text) + if match and i > begin_page: + start_page = i + matched_text:str = match.group(0) # 获取整个匹配的文本 + # 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成 + if not local_end_pattern: + if '章' in matched_text: + chapter_type = '章' + elif '部分' in matched_text: + chapter_type = '部分' + else: + chapter_type = None # 未匹配到“章”或“部分” + + if chapter_type: + # 根据 chapter_type 动态生成 end_pattern + end_pattern_dynamic = regex.compile( + rf'^(? mid_page: + end_page = i + break + + return start_page, mid_page, end_page + + # 第一次提取尝试,使用初始的 begin_pattern + start_page, mid_page, end_page = run_extraction(begin_pattern) + + # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 + if not (start_page and mid_page and end_page): + print(f"第二次尝试 tobidders_notice!") + pdf_document = PdfReader(pdf_path) + start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) + if start_page and end_page and mid_page: + return start_page, mid_page, end_page + else: + # 定义新的 begin_pattern 和 end_pattern + new_begin_pattern = regex.compile( + r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|' + r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表', + regex.MULTILINE + ) + new_end_pattern = regex.compile( + r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', + regex.MULTILINE + ) + print("第三次尝试 tobidders_notice! ") + # 第二次提取尝试,使用新的模式 + start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) + + return start_page, mid_page, end_page + + +def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): + output_suffix = "tobidders_notice" + begin_pattern = regex.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+', + regex.MULTILINE + ) + end_pattern = regex.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分 + ) + exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词 + + exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') + + + # 提取第一部分 + start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix) + if start_page1 is None or end_page1 is None: + return "", "", "" + + # 提取第二部分 + start_page2 = end_page1 + + # 检查end_page1页面的内容 + text = pdf_document.pages[end_page1].extract_text() or "" + cleaned_text = clean_page_content(text, common_header) + match = end_pattern.search(cleaned_text) + + if match: + # 获取匹配到的中文部分 + chapter_title = match.group(1) + # 检查是否包含排除关键词 + if any(word in chapter_title for word in exclusion_words): + # 如果包含排除关键词,直接返回相同的路径 + return start_page1, end_page1, end_page1 + + # 如果不包含排除关键词,继续提取第二部分 + _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header, + exclusion_pattern, output_suffix) + + if end_page2 is None: + return start_page1, end_page1, "" + + return start_page1, end_page1, end_page2 + if __name__ == "__main__": file_path=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf' output_folder=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp' diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index f9eeede..a354335 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -139,25 +139,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter for i in range(len(pdf_document.pages)): page = pdf_document.pages[i] text = page.extract_text() - if text: - cleaned_text = clean_page_content(text, common_header) - if flag and regex.search(exclusion_pattern, cleaned_text): - flag = False + cleaned_text = clean_page_content(text, common_header) + if flag and regex.search(exclusion_pattern, cleaned_text): + flag = False + continue + if start_page is None and regex.search(begin_pattern, cleaned_text): + if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): + start_page = i continue - if regex.search(begin_pattern, cleaned_text) and i >= begin_page: - if start_page and (output_suffix == "notice" or output_suffix == "invalid"): - pass - else: - start_page = i - if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, - cleaned_text): - condition = i > start_page - if condition: - is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页 - if is_invalid_condition or output_suffix != "invalid": - end_page = i - break # 找到结束页后退出循环 - + if start_page is not None: + if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text): + end_page = i + break # 移除对 extract_pages_twice 的调用 if start_page is None or end_page is None: print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!") diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 6540651..70297d6 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -4,51 +4,14 @@ import os # 用于文件和文件夹操作 from flask_app.general.clean_pdf import clean_page_content, extract_common_header from flask_app.general.merge_pdfs import merge_and_cleanup from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \ - convert_to_pdf, get_invalid_file + convert_to_pdf, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic from flask_app.general.通用功能函数 import get_global_logger -def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, - output_suffix="normal"): - start_page = None - end_page = None - flag=True - for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): - text = page.extract_text() or "" - cleaned_text = clean_page_content(text, common_header) - if output_suffix == "tobidders_notice": - catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) - if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text): - flag=False - continue - if begin_page==0 and catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - else: - #一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern - if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text): - flag=False - continue - if start_page is None and regex.search(begin_pattern, cleaned_text): - if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): - start_page = i - continue - if start_page is not None: - if output_suffix == "tobidders_notice": - if regex.search(end_pattern, cleaned_text) and i > start_page: - end_page = i - break - else: - if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text): - end_page = i - break - return start_page, end_page - - def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix,logger): try: common_header = extract_common_header(pdf_path) pdf_document = PdfReader(pdf_path) exclusion_pattern = None - total_pages = len(pdf_document.pages) - 1 # 获取总页数 if output_suffix == "tobidders_notice": exclusion_pattern = regex.compile( @@ -59,9 +22,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if not start_page or not mid_page or not end_page: print(f"三次提取tobidders_notice均失败!!{pdf_path}") return "", "" - # if start_page is None or end_page is None or mid_page is None: - # print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") - # return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page) path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header) if mid_page != end_page: path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header) @@ -76,16 +36,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) - # 针对 selection = 6 的特殊处理 - if output_suffix == "format": - if start_page is None: - print(f"{output_suffix}: 未找到起始页,提取失败!") - return "" - if end_page is None: - # 如果未匹配到结束页,默认截取到文件末尾 - end_page = total_pages - print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。") - return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header) if start_page is None or end_page is None: print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger) @@ -136,212 +86,6 @@ def get_patterns_for_notice(): ) return begin_pattern, end_pattern -def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): - """ - 从PDF文档中提取起始页、中间页和结束页。 - - 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。 - - 参数: - pdf_document (PDFDocument): 要处理的PDF文档对象。 - begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 - begin_page (int): 开始搜索的页码。 - common_header (str): 每页需要清理的公共头部文本。 - exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。 - - 返回: - tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None) - """ - - def run_extraction(local_begin_pattern, local_end_pattern=None): - """ - 使用提供的 begin 和 end 模式运行提取过程。 - - 如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。 - - 参数: - local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 - local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。 - - 返回: - tuple: (start_page, mid_page, end_page) - """ - start_page = None - mid_page = None - end_page = None - combined_mid_pattern = None # 中间页的组合模式 - pdf_document = PdfReader(pdf_path) - catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE) - for i, page in enumerate(pdf_document.pages): - text = page.extract_text() or "" - cleaned_text = clean_page_content(text, common_header) - - # 如果已经找到中间页,且当前页匹配排除模式,则跳过 - if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None: - continue - if begin_page==0 and catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - - # 识别起始页 - if start_page is None: - match = regex.search(local_begin_pattern, cleaned_text) - if match and i > begin_page: - start_page = i - matched_text:str = match.group(0) # 获取整个匹配的文本 - # 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成 - if not local_end_pattern: - if '章' in matched_text: - chapter_type = '章' - elif '部分' in matched_text: - chapter_type = '部分' - else: - chapter_type = None # 未匹配到“章”或“部分” - - if chapter_type: - # 根据 chapter_type 动态生成 end_pattern - end_pattern_dynamic = regex.compile( - rf'^(? mid_page: - end_page = i - break - - return start_page, mid_page, end_page - - # 第一次提取尝试,使用初始的 begin_pattern - start_page, mid_page, end_page = run_extraction(begin_pattern) - - # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 - if not (start_page and mid_page and end_page): - print(f"第二次尝试 tobidders_notice!") - pdf_document = PdfReader(pdf_path) - start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) - if start_page and end_page and mid_page: - return start_page, mid_page, end_page - else: - # 定义新的 begin_pattern 和 end_pattern - new_begin_pattern = regex.compile( - r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|' - r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表', - regex.MULTILINE - ) - new_end_pattern = regex.compile( - r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', - regex.MULTILINE - ) - print("第三次尝试 tobidders_notice! ") - # 第二次提取尝试,使用新的模式 - start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) - - return start_page, mid_page, end_page - - -def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): - output_suffix = "tobidders_notice" - begin_pattern = regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+', - regex.MULTILINE - ) - end_pattern = regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分 - ) - exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词 - - exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') - - - # 提取第一部分 - start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix) - if start_page1 is None or end_page1 is None: - return "", "", "" - - # 提取第二部分 - start_page2 = end_page1 - - # 检查end_page1页面的内容 - text = pdf_document.pages[end_page1].extract_text() or "" - cleaned_text = clean_page_content(text, common_header) - match = end_pattern.search(cleaned_text) - - if match: - # 获取匹配到的中文部分 - chapter_title = match.group(1) - # 检查是否包含排除关键词 - if any(word in chapter_title for word in exclusion_words): - # 如果包含排除关键词,直接返回相同的路径 - return start_page1, end_page1, end_page1 - - # 如果不包含排除关键词,继续提取第二部分 - _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header, - exclusion_pattern, output_suffix) - - if end_page2 is None: - return start_page1, end_page1, "" - - return start_page1, end_page1, end_page2 - - def extract_pages_qualification(pdf_document, begin_page, common_header): # 开始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = regex.compile(