"""Split procurement/bidding PDF documents into chapter-level sub-PDFs.

Each "selection" corresponds to one document section to extract:
    1 - notice (公告), 2 - evaluation method (评标办法),
    3 - qualification review (资格审查), 4 - instructions to bidders
    (投标人须知, saved as two parts), 5 - procurement requirements (采购需求),
    6 - format/templates chapter (格式).

Extraction works by scanning page text for chapter-heading regexes
(begin/end patterns), with several fallback strategies when the primary
patterns fail.  Page indices are 0-based throughout.
"""
import logging
import os
import re  # regular expressions for chapter-heading matching
import concurrent.futures

from PyPDF2 import PdfReader, PdfWriter

from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.format_change import docx2pdf
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods


def get_global_logger(unique_id):
    """Return a logger keyed by ``unique_id`` (or the root logger if None)."""
    if unique_id is None:
        return logging.getLogger()  # default (root) logger
    return logging.getLogger(unique_id)


# Module-level logger, rebound per request by the ``truncate_pdf_*`` entry points.
logger = None


def is_pdf_or_doc(filename):
    """Return True if ``filename`` looks like a PDF or Word document."""
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))


def convert_to_pdf(file_path):
    """Convert .doc/.docx to PDF via ``docx2pdf``; pass PDFs through unchanged."""
    if file_path.lower().endswith(('.doc', '.docx')):
        return docx2pdf(file_path)
    return file_path


def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Convert ``file_path`` to PDF if needed, extract the requested section,
    and normalize the result into a list of output paths ("" on failure).

    For "tobidders_notice" the section is saved as two parts; for
    "qualification1" the extracted file is merged with the "qualification3"
    output via ``merge_and_cleanup``.
    """
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
    if result:
        if output_suffix == "tobidders_notice":
            # Result is a 2-tuple; map None entries to "".
            path1, path2 = result
            return [path1 or "", path2 or ""]
        elif output_suffix == "qualification1":
            merge_and_cleanup(result, "qualification3")
            return [result or ""]
        return [result or ""]
    return [""]  # nothing extracted


# NOTE: by default, once start_page matches it is not reset.  A page of the
# table of contents can be matched by mistake when begin_page is small (e.g.
# matching '第一章 招标公告' inside the TOC), hence the ``i > begin_page`` guards.
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
                          exclusion_pattern=None, output_suffix="normal"):
    """Scan pages from ``begin_page`` on, returning (start_page, end_page).

    A page matching ``exclusion_pattern`` is skipped; for "tobidders_notice"
    exclusions only apply once a start page has been found.  For "notice" the
    start may coincide with ``begin_page``; otherwise it must lie after it.
    Either index may be None when no match is found.
    """
    start_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if output_suffix == "tobidders_notice":
            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and start_page is not None:
                continue
        else:
            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
                continue
        if start_page is None and re.search(begin_pattern, cleaned_text):
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
            end_page = i
            break
    return start_page, end_page


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Extract one section from ``pdf_path`` and save it under ``output_folder``.

    Returns the saved path ("" on failure), or a (path1, path2) pair for
    "tobidders_notice".  Falls back to ``extract_pages_twice`` when the
    primary patterns fail.
    """
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        exclusion_pattern = None
        last_page_index = len(pdf_document.pages) - 1  # index of the final page
        if output_suffix == "tobidders_notice":
            exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
            start_page, mid_page, end_page = extract_pages_tobidders_notice(
                pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
            )
            # Explicit None checks: page index 0 is a legal value.
            if start_page is None or mid_page is None or end_page is None:
                print(f"三次提取tobidders_notice均失败!!{pdf_path}")
                return "", ""
            path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder,
                                         "tobidders_notice_part1")
            path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
                                         "tobidders_notice_part2")
            return path1, path2
        else:
            if output_suffix == "qualification1":
                exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                                                         common_header, exclusion_pattern, output_suffix)
            # Special handling for selection == 6 (format chapter): missing end
            # page means "truncate to end of file" rather than failure.
            if output_suffix == "format":
                if start_page is None:
                    print(f"{output_suffix}: 未找到起始页,提取失败!")
                    return ""
                if end_page is None:
                    end_page = last_page_index
                    print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。")
                return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder,
                                            output_suffix)
            if start_page is None or end_page is None:
                print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
            elif output_suffix == "qualification1":
                # Also extract the evaluation-method chapter as "qualification3"
                # so it can be merged with the qualification section.
                truncate_pdf_main(pdf_path, output_folder, 2, "qualification3")
            return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""


def get_patterns_for_procurement():
    """Begin/end heading patterns for the procurement-requirements chapter."""
    begin_pattern = re.compile(
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # "第X章" or "第X部分"
        r'[\u4e00-\u9fff、()()]*?'  # allowed CJK/punctuation run
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()()]*?要求[\u4e00-\u9fff、()()]*?\s*$|'  # "...要求" headings
        r'(?:采购|需求)[\u4e00-\u9fff、()()]*?)\s*$',  # or "采购"/"需求" headings
        re.MULTILINE
    )
    end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
    return begin_pattern, end_pattern


def get_patterns_for_evaluation_method():
    """Begin/end heading patterns for the evaluation-method chapter."""
    begin_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*'  # "第X章" or "第X部分"
        r'(?:[\u4e00-\u9fff、()()]*?)'  # allowed CJK/punctuation run
        r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # lookahead: heading mentions an evaluation keyword anywhere
        r'(?=.*(?:办法|方法))'  # lookahead: and mentions "办法"/"方法"
        r'[\u4e00-\u9fff、()()]*\s*$',  # rest of the heading to end of line
        re.MULTILINE
    )
    end_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
        re.MULTILINE)
    return begin_pattern, end_pattern


def get_patterns_for_notice():
    """Begin/end heading patterns for the tender-notice chapter."""
    begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE
    )
    return begin_pattern, end_pattern


def get_patterns_for_notice_twice():
    """Stricter fallback patterns for the tender-notice chapter."""
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
    )
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE
    )
    return begin_pattern, end_pattern


def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
    """Locate the start, middle and end pages of the "instructions to bidders"
    section (front table + body).

    Up to three strategies are tried: the caller's ``begin_pattern``, then
    ``extract_pages_twice_tobidders_notice``, then a looser pattern pair.

    Args:
        pdf_path: path of the PDF to scan.
        begin_pattern: regex identifying the section's start heading.
        begin_page: pages at or before this index cannot be the start.
        common_header: repeated page-header text stripped from each page.
        exclusion_pattern: pages matching this are skipped (after mid found).

    Returns:
        (start_page, mid_page, end_page) — any element None on failure.
    """

    def run_extraction(local_begin_pattern, local_end_pattern=None):
        """Run one scan with the given begin pattern.

        When ``local_end_pattern`` is omitted, the end pattern (and the
        mid-page pattern) are derived dynamically from whether the matched
        start heading used "章" or "部分".
        """
        start_page = None
        mid_page = None
        end_page = None
        chapter_type = None  # "章" or "部分", from the matched start heading
        combined_mid_pattern = None  # pattern identifying the part-1/part-2 split page
        end_pattern_dynamic = None  # derived end pattern when none was supplied
        pdf_document = PdfReader(pdf_path)
        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            # Once the mid page is known, skip pages matching the exclusions.
            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
                continue
            # --- start page ---
            if start_page is None:
                match = re.search(local_begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    matched_text = match.group(0)
                    if not local_end_pattern:
                        # Derive patterns from the heading style actually used.
                        if '章' in matched_text:
                            chapter_type = '章'
                        elif '部分' in matched_text:
                            chapter_type = '部分'
                        else:
                            chapter_type = None  # neither "章" nor "部分"
                        if chapter_type:
                            end_pattern_dynamic = re.compile(
                                rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+',
                                re.MULTILINE
                            )
                            # The *other* heading style may mark the mid split.
                            if chapter_type == '章':
                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
                            elif chapter_type == '部分':
                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
                            else:
                                additional_mid_pattern = ''
                            # Base mid pattern: a "(一) 说明/总则"-style sub-heading.
                            base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
                                               r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
                            if additional_mid_pattern:
                                combined_mid_pattern = re.compile(
                                    rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
                                    re.MULTILINE
                                )
                            else:
                                combined_mid_pattern = re.compile(
                                    rf'{base_mid_pattern}',
                                    re.MULTILINE
                                )
                        else:
                            # No heading style detected: fall back to defaults.
                            end_pattern_dynamic = re.compile(
                                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                                re.MULTILINE
                            )
                            base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
                                               r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
                            combined_mid_pattern = re.compile(
                                rf'{base_mid_pattern}',
                                re.MULTILINE
                            )
                    else:
                        # Fixed end pattern supplied: use a loose mid pattern.
                        # Can match e.g. "东莞市粤隆招标有限公司编制 46一、说明".
                        base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$'
                        combined_mid_pattern = re.compile(
                            rf'{base_mid_pattern}',
                            re.MULTILINE
                        )
                    continue  # start page found; move to the next page
            # --- mid page ---
            if start_page is not None and mid_page is None and combined_mid_pattern:
                # The page right after the start may repeat the begin heading
                # (e.g. running title) — do not take it as the mid page.
                if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text):
                    continue
                if re.search(combined_mid_pattern, cleaned_text):
                    mid_page = i
            # --- end page ---
            if start_page is not None and mid_page is not None:
                current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
                if re.search(current_end_pattern, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break
        return start_page, mid_page, end_page

    # First attempt: caller-supplied begin pattern.
    start_page, mid_page, end_page = run_extraction(begin_pattern)
    if start_page is None or mid_page is None or end_page is None:
        print(f"第二次尝试 tobidders_notice!{pdf_path}")
        pdf_document = PdfReader(pdf_path)
        start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
        if start_page is not None and end_page is not None and mid_page is not None:
            return start_page, mid_page, end_page
        else:
            # Third attempt with a looser begin pattern and an explicit end pattern.
            new_begin_pattern = re.compile(
                r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
                r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
                re.MULTILINE
            )
            new_end_pattern = re.compile(
                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
                re.MULTILINE
            )
            print("第三次尝试 tobidders_notice! ")
            start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
    return start_page, mid_page, end_page


def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
    """Fallback for documents where "instructions to bidders" spans two chapters.

    Returns (start_page1, end_page1, end_page2) page indices, where part 1 is
    [start_page1, end_page1] and part 2 is [end_page1, end_page2].  Returns
    (None, None, None) when part 1 cannot be located; when part 2 cannot be
    located (or its heading contains an excluded keyword) the split collapses
    to (start, end, end).
    """
    output_suffix = "tobidders_notice"
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+', re.MULTILINE
    )
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE  # captures the CJK title
    )
    # Chapter titles containing these keywords terminate the section.
    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]
    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
    # Part 1.
    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                                                   common_header)
    if start_page1 is None or end_page1 is None:
        return None, None, None
    # Part 2 starts where part 1 ended.
    start_page2 = end_page1
    # Inspect the heading on the boundary page.
    text = pdf_document.pages[end_page1].extract_text() or ""
    cleaned_text = clean_page_content(text, common_header)
    match = end_pattern.search(cleaned_text)
    if match:
        chapter_title = match.group(1)
        if any(word in chapter_title for word in exclusion_words):
            # Next chapter is unrelated: no second part.
            return start_page1, end_page1, end_page1
    # Otherwise scan onward for the end of part 2.
    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
                                         exclusion_pattern, output_suffix)
    if end_page2 is None:
        return start_page1, end_page1, end_page1
    return start_page1, end_page1, end_page2


def extract_pages_qualification(pdf_document, begin_page, common_header):
    """Second-chance scan for the qualification-review section hidden in
    appendices/attachments (附录/附件/附表).

    Returns (start_page, end_page); either may be None.
    """
    # Start pattern: only "附录"/"附件"/"附表" headings.
    begin_pattern = re.compile(
        r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
        re.MULTILINE
    )
    # Priority pattern: a page starting directly with a review-section title.
    priority_pattern = re.compile(
        r'^(资格性检查|资格审查|符合性审查)',
        re.MULTILINE
    )
    # End pattern: another appendix/attachment heading.
    end_pattern_attachment = re.compile(
        r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$',
        re.MULTILINE
    )
    # End pattern: a regular chapter heading.
    end_pattern_chapter = re.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
        re.MULTILINE
    )
    print("第二次尝试:匹配附件")
    start_page = None
    end_page = None
    include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
    exclude_keywords = ["声明函", "承诺函"]
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            # Priority: a page that opens with the review-section title itself.
            priority_match = priority_pattern.search(cleaned_text)
            if priority_match and start_page is None:
                start_page = i
                continue
            else:
                # Otherwise require an appendix heading plus review keywords
                # (and none of the excluded declaration-letter keywords).
                if (
                        any(keyword in cleaned_text for keyword in include_keywords) and
                        all(keyword not in cleaned_text for keyword in exclude_keywords) and
                        start_page is None
                ):
                    if begin_pattern.search(cleaned_text):
                        start_page = i
                        continue
                # End on the next appendix heading — but only if that page no
                # longer mentions any review keyword (it may still belong to
                # the section).
                if start_page is not None and end_pattern_attachment.search(cleaned_text):
                    if not any(keyword in cleaned_text for keyword in include_keywords):
                        if i > start_page:
                            end_page = i
                            break
                    else:
                        print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}")
                # Or end on a regular chapter heading.
                elif start_page is not None and end_pattern_chapter.search(cleaned_text):
                    if i > start_page:
                        end_page = i
                        print(f"找到结束页 (章节标题): {end_page}")
                        break
    return start_page, end_page


def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
    """Fallback extraction with alternative pattern sets per section type.

    Returns the saved output path, or "" when every strategy fails.
    """
    try:
        exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
        pdf_document = PdfReader(pdf_path)
        patterns = None
        start_page = None
        end_page = None
        if output_suffix == "procurement":
            patterns = [get_patterns_for_procurement()]
        elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
            patterns = [get_patterns_for_evaluation_method()]
        elif output_suffix == "notice":
            patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()]
        elif output_suffix == "qualification1":
            start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
        if patterns:
            # Try each pattern pair in order until one matches.
            for pattern_pair in patterns:
                start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1],
                                                             begin_page, common_header, exclusion_pattern,
                                                             output_suffix)
                if start_page is not None and end_page is not None:
                    break
        if start_page is None or end_page is None:
            if output_suffix == "qualification1":
                # Third attempt: reuse the evaluation-method chapter.
                print("第三次尝试资格审查:尝试提取评分办法章节...")
                temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
                if len(temp) > 0:
                    return temp[0]
                else:
                    return ""
            else:
                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
                return ""
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""


def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """Write pages [start_page, end_page] to ``<stem>_<suffix>.pdf``.

    For the "notice" section the pages *before* the start are additionally
    saved as ``<stem>_before.pdf``.  Returns the output path, or "" on any
    invalid range or error.
    """
    try:
        if start_page is None or end_page is None:
            print("Error: start_page 或 end_page 为 None")
            return ""
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
        if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""
        if output_suffix == 'notice' and start_page - 1 >= 0:
            # Save everything preceding the notice chapter as a "_before" file.
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            before_doc = PdfWriter()
            for page_num in range(0, start_page):
                before_doc.add_page(pdf_document.pages[page_num])
            with open(before_pdf_path, 'wb') as f:
                before_doc.write(f)
        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        print(f"Error in save_extracted_pages: {e}")
        return ""


def get_start_and_common_header(input_path):
    """Return (common_header, last_begin_index) for ``input_path``.

    ``last_begin_index`` is the first page (within the first 11 pages, 0-based)
    whose text matches a tender-notice heading and is not a table of contents;
    0 when no such page is found within that window.
    """
    common_header = extract_common_header(input_path)
    last_begin_index = 0
    begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
            # No notice heading within the first 11 pages: give up with 0.
            return common_header, 0
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            if begin_pattern.search(cleaned_text) and not re.search(r'目\s*录', cleaned_text):
                last_begin_index = i  # first matching page index (0-based)
                return common_header, last_begin_index
    return common_header, last_begin_index


def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
    """Entry point: process a single file or every PDF/Word file in a folder.

    Returns a flat list of generated output paths ("" entries mark failures).
    """
    try:
        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                file_path = os.path.join(input_path, file_name)
                if is_pdf_or_doc(file_path):
                    result = process_input(file_path, output_folder, selection, output_suffix)
                    # process_input returns a list (or tuple); flatten it so
                    # the result stays a flat list of path strings.
                    if isinstance(result, (list, tuple)):
                        generated_files.extend([f if f else "" for f in result])
                    else:
                        generated_files.append(result)
            return generated_files
        elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
            return process_input(input_path, output_folder, selection, output_suffix)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return ['']
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
        return ['']


def process_input(input_path, output_folder, selection, output_suffix):
    """Resolve patterns and start page for ``selection`` and run extraction.

    Selections: 1 notice, 2 evaluation method, 3 qualification review,
    4 instructions to bidders, 5 procurement requirements, 6 format chapter.
    Returns a list of output paths ([''] on error).
    """
    try:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        common_header, last_begin_index = get_start_and_common_header(input_path)
        # When no notice heading was found, fall back to a per-selection
        # minimum start page to avoid matching inside the table of contents.
        begin_page = last_begin_index if last_begin_index != 0 else {
            4: 1,
            2: 5,
            3: 5,
            1: 0,
            5: 3
        }.get(selection, 0)
        if selection == 1:
            begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
                                       re.MULTILINE)
            end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
            local_output_suffix = "notice"
        elif selection == 2:
            begin_pattern = re.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*(磋商|谈判|评标|评定|评审|办法|方法).*(磋商|谈判|评标|评定|评审|办法|方法)')
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
            local_output_suffix = "evaluation_method"
        elif selection == 3:
            begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
            local_output_suffix = "qualification1"
        elif selection == 4:
            begin_pattern = re.compile(
                r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)',
                re.MULTILINE)
            end_pattern = None  # derived dynamically in extract_pages_tobidders_notice
            local_output_suffix = "tobidders_notice"
        elif selection == 5:
            # The negative lookahead excludes e.g. a '采购相关说明' chapter
            # (seen in Baotou documents).
            begin_pattern = re.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*')
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
            local_output_suffix = "procurement"
        elif selection == 6:
            begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
            local_output_suffix = "format"
        else:
            print("无效的选择:请选择1-6")
            return ['']
        # "default" means: use the section-specific suffix resolved above.
        if output_suffix == "default":
            output_suffix = local_output_suffix
        return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
    except Exception as e:
        print(f"Error in process_input: {e}")
        return ['']


def truncate_pdf_multiple(pdf_path, output_folder, unique_id="123"):
    """Extract sections 1-5 concurrently and merge the base-info outputs.

    Returns the list of per-section paths plus the merged file path (or ""
    when merging fails) appended last.
    """
    global logger
    logger = get_global_logger(unique_id)
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []
    selections = range(1, 6)
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Submit all tasks, then collect in selection order.
        future_to_selection = {
            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,
                                       output_suffix="default")
            for selection in selections}
        for selection in selections:
            future = future_to_selection.get(selection)
            try:
                files = future.result()
                if files:
                    truncate_files.extend(files)
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
    if merged_path:
        truncate_files.append(merged_path)
        logger.info(f"已生成合并文件: {merged_output_path}")
    else:
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
    return truncate_files


# Light-weight parse: only the first few chapters are needed.
def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
    """Extract only the given ``selections`` concurrently and merge the results.

    Args:
        pdf_path: PDF file to process.
        output_folder: destination folder for the extracted files.
        selections: iterable of selection numbers to process.
        unique_id: key for the per-request logger.

    Returns:
        List of extracted file paths, with the merged file path (or "")
        appended last.
    """
    global logger
    logger = get_global_logger(unique_id)
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        # Submit all tasks, then collect in selection order.
        future_to_selection = {
            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,
                                       output_suffix="default")
            for selection in selections}
        for selection in selections:
            future = future_to_selection.get(selection)
            try:
                files = future.result()
                if files:
                    if isinstance(files, list):
                        truncate_files.extend(files)
                    elif isinstance(files, str):
                        truncate_files.append(files)
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
    if merged_path:
        truncate_files.append(merged_path)
        logger.info(f"已生成合并文件: {merged_output_path}")
    else:
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
    return truncate_files


# TODO(review): some documents (交通智能系统, 招标(1)(1), 包头, 绍兴) are known
# problem cases — the qualification section may not share a chapter with the
# evaluation method, and "invalid bid" / "more" cases (as in construction
# tenders) may need handling; ztbfile.pdf lacks qualification review and the
# Baotou file lacks the conformity review.
if __name__ == "__main__":
    input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output5"
    # 1 - notice, 2 - evaluation method, 3 - qualification (suffix
    # qualification1/qualification2), 4 - bidder-notice part1/part2,
    # 5 - procurement requirements
    selection = 5
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)