diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 2d4637e..5abe9b0 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -12,10 +12,9 @@ def extract_text_by_page(file_path): page = reader.pages[page_num] text = page.extract_text() if text: - print(text) + # print(text) cleaned_text = clean_page_content(text,common_header) - # cleaned_text=text - # print(cleaned_text) + print(cleaned_text) print("-----------------"+str(page_num)) result += cleaned_text # print(f"Page {page_num + 1} Content:\n{cleaned_text}") @@ -119,8 +118,8 @@ def save_extracted_text_to_txt(pdf_path, txt_path): if __name__ == '__main__': # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' - # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf" - file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf' + file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件.pdf" + # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" diff --git a/flask_app/test_case/test_正则表达式.py b/flask_app/test_case/test_正则表达式.py index 12dc5cd..c9ecb3b 100644 --- a/flask_app/test_case/test_正则表达式.py +++ b/flask_app/test_case/test_正则表达式.py @@ -1,55 +1,26 @@ -import re -line_stripped="""1.采购人:陕西省某单位 -2、采购代理机构:陕西坤硕项目管理有限公司 -3、供应商:响应招标并且符合招标文件规定资格条件和参加投标竞 -争的法人、其他组织或者自然人 -""" -pure_number_match = re.match(r'^(\d+)([^.\d)()、].*)', line_stripped) # 不允许出现右括号 -if pure_number_match: - print("yes") +import regex -# 测试字符串 +begin_pattern = regex.compile( + r'(? begin_page: start_page = i matched_text = match.group(0) # 获取整个匹配的文本 @@ -105,16 +105,16 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin if chapter_type: # 根据 chapter_type 动态生成 end_pattern if not is_secondary_match: - end_pattern = re.compile( + end_pattern = regex.compile( rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' - r'^评标办法前附表|' + r'^评标(方法|办法)前附表|' r'^附录(?:一)?[::]|' r'^附件(?:一)?[::]|' r'^附表(?:一)?[::]', - re.MULTILINE + regex.MULTILINE ) else: - end_pattern = re.compile( + end_pattern = regex.compile( rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$' ) # 根据 chapter_type 动态生成 additional_mid_pattern @@ -126,51 +126,50 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin additional_mid_pattern = '' # 定义基础的 mid_pattern - base_mid_pattern = ( - r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' - r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)' - ) + base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \ + r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \ + r'|(? mid_page: end_page = i break @@ -195,22 +194,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter pdf_document = PdfReader(pdf_path) start_page = None end_page = None - exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') + exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') # 遍历文档的每一页,查找开始和结束短语的位置 for i in range(len(pdf_document.pages)): page = pdf_document.pages[i] text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - # if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" - if re.search(exclusion_pattern, cleaned_text): + # if is_secondary_match and regex.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" + if regex.search(exclusion_pattern, cleaned_text): continue - if re.search(begin_pattern, cleaned_text) and i >= begin_page: + if regex.search(begin_pattern, cleaned_text) and i >= begin_page: if start_page and (output_suffix == "notice" or output_suffix == "invalid"): pass else: start_page = i - if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text): + if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,cleaned_text): condition = i > start_page if condition: is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页 @@ -240,16 +239,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])' # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) - end_pattern_attachment = re.compile( + end_pattern_attachment = regex.compile( r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$', - re.MULTILINE + regex.MULTILINE ) # 定义结束匹配模式 - 章节标题、评标附表、投标人须知等 - end_pattern_chapter = re.compile( + end_pattern_chapter = regex.compile( r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|' r'\s*评标(?:办法|方法)前附表\s*$|' r'投标人须知', - re.MULTILINE + regex.MULTILINE ) start_page = None end_page = None @@ -260,7 +259,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l cleaned_text = clean_page_content(text, common_header) # 确定起始页,需在last_begin_index之后 if any(key in cleaned_text for key in include_keys): - if re.search(begin_pattern, cleaned_text, re.MULTILINE): + if regex.search(begin_pattern, cleaned_text, regex.MULTILINE): if start_page is None: start_page = i # 确保起始页不小于章节的开始页码 continue @@ -305,23 +304,28 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 - begin_pattern = re.compile( - r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', - re.MULTILINE + begin_pattern = regex.compile( + r'.*(? 25: - return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0 + if i > 10: + return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) + # 检查是否存在"目录" + if catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 return common_header, last_begin_index return common_header, last_begin_index - def truncate_pdf_main(input_path, output_folder, selection): if os.path.isdir(input_path): generated_files = [] @@ -340,19 +344,21 @@ def truncate_pdf_main(input_path, output_folder, selection): # Selection 1: 投标人须知前附表 pattern_pairs = [ ( - re.compile( - r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'), - re.compile( - r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', - re.MULTILINE) + regex.compile( + r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE), + regex.compile( + r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', + regex.MULTILINE) ), ( - re.compile( - r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', - re.MULTILINE), - re.compile( - r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', - re.MULTILINE) + regex.compile( + r'.*(?= begin_page) or (output_suffix != "notice" and i > begin_page): start_page = i + continue if start_page is not None: if output_suffix == "tobidders_notice": - if re.search(end_pattern, cleaned_text) and i > start_page: + if regex.search(end_pattern, cleaned_text) and i > start_page: end_page = i break else: - if re.search(end_pattern, cleaned_text) and i > start_page and not re.search(begin_pattern,cleaned_text): + if regex.search(end_pattern, cleaned_text) and i > start_page and not regex.search(begin_pattern,cleaned_text): end_page = i break return start_page, end_page @@ -115,8 +83,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter total_pages = len(pdf_document.pages) - 1 # 获取总页数 if output_suffix == "tobidders_notice": - exclusion_pattern = re.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') + exclusion_pattern = regex.compile( + r'文件的构成|文件的组成|文件构成|文件组成') start_page, mid_page, end_page = extract_pages_tobidders_notice( pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern ) @@ -138,8 +106,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter else: # 原有的处理逻辑保持不变 if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method": - exclusion_pattern = re.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') + exclusion_pattern = regex.compile( + r'文件的构成|文件的组成|文件构成|文件组成') start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) # 针对 selection = 6 的特殊处理 @@ -164,62 +132,45 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter def get_patterns_for_procurement(): - # begin_pattern = re.compile( + # begin_pattern = regex.compile( # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*', - # re.MULTILINE) - begin_pattern = re.compile( - r'(? begin_page: +# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page: # start_page = i -# if start_page is not None and mid_page is None and re.search( +# if start_page is not None and mid_page is None and regex.search( # r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text): # mid_page = i -# if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page: +# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page: # end_page = i # break # return start_page, mid_page, end_page @@ -248,10 +199,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h 参数: pdf_document (PDFDocument): 要处理的PDF文档对象。 - begin_pattern (str 或 re.Pattern): 用于识别起始的正则表达式模式。 + begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 begin_page (int): 开始搜索的页码。 common_header (str): 每页需要清理的公共头部文本。 - exclusion_pattern (str 或 re.Pattern): 用于排除某些页的模式。 + exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。 返回: tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None) @@ -264,8 +215,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h 如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。 参数: - local_begin_pattern (str 或 re.Pattern): 用于识别起始的正则表达式模式。 - local_end_pattern (str 或 re.Pattern, 可选): 用于识别结束的正则表达式模式。 + local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 + local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。 返回: tuple: (start_page, mid_page, end_page) @@ -281,12 +232,12 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h cleaned_text = clean_page_content(text, common_header) # 如果已经找到中间页,且当前页匹配排除模式,则跳过 - if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: + if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None: continue # 识别起始页 if start_page is None: - match = re.search(local_begin_pattern, cleaned_text) + match = regex.search(local_begin_pattern, cleaned_text) if match and i > begin_page: start_page = i matched_text = match.group(0) # 获取整个匹配的文本 @@ -302,9 +253,9 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h if chapter_type: # 根据 chapter_type 动态生成 end_pattern - end_pattern_dynamic = re.compile( - rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+', - re.MULTILINE + end_pattern_dynamic = regex.compile( + rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(? mid_page: end_page = i break @@ -376,21 +328,21 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 if not (start_page and mid_page and end_page): - print(f"第二次尝试 tobidders_notice!{pdf_path}") + print(f"第二次尝试 tobidders_notice!") pdf_document = PdfReader(pdf_path) start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) if start_page and end_page and mid_page: return start_page, mid_page, end_page else: # 定义新的 begin_pattern 和 end_pattern - new_begin_pattern = re.compile( + new_begin_pattern = regex.compile( r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|' r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', - re.MULTILINE + regex.MULTILINE ) - new_end_pattern = re.compile( + new_end_pattern = regex.compile( r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', - re.MULTILINE + regex.MULTILINE ) print("第三次尝试 tobidders_notice! ") # 第二次提取尝试,使用新的模式 @@ -401,19 +353,19 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): output_suffix = "tobidders_notice" - begin_pattern = re.compile( + begin_pattern = regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+', - re.MULTILINE + regex.MULTILINE ) - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE # 捕获中文部分 + end_pattern = regex.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分 ) exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词 - exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成') + exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成') # 提取第一部分 - start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) + start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix) if start_page1 is None or end_page1 is None: return "", "", "" @@ -438,36 +390,36 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page exclusion_pattern, output_suffix) if end_page2 is None: - return start_page1, end_page1, end_page1 + return start_page1, end_page1, "" return start_page1, end_page1, end_page2 def extract_pages_qualification(pdf_document, begin_page, common_header): # 开始匹配模式,仅匹配“附录”、“附件”或“附表” - begin_pattern = re.compile( + begin_pattern = regex.compile( r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)', - re.MULTILINE + regex.MULTILINE ) # 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头 - priority_pattern = re.compile( + priority_pattern = regex.compile( r'^(资格性检查|资格审查|符合性审查)', - re.MULTILINE + regex.MULTILINE ) # 结束匹配模式 - 附录、附件、附表等 - end_pattern_attachment = re.compile( + end_pattern_attachment = regex.compile( r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$', - re.MULTILINE + regex.MULTILINE ) # 结束匹配模式 - 章节标题 - end_pattern_chapter = re.compile( - r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', - re.MULTILINE + end_pattern_chapter = regex.compile( + r'(? 0: - return temp[0] + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf") + if os.path.isfile(evaluation_method_file): + print(f"找到评分办法章节文件: {evaluation_method_file},直接返回。") + return evaluation_method_file else: - return "" + temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2") + if len(temp) > 0: + return temp[0] + else: + return "" else: print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") return "" @@ -593,20 +551,27 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo print(f"Error in save_extracted_pages: {e}") return "" # 返回空字符串 - def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 - begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', - re.MULTILINE) + begin_pattern = regex.compile( + r'.*(? 10: - return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0 + return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - if begin_pattern.search(cleaned_text) and not re.search(r'目\s*录', cleaned_text): + # 检查是否存在"目录" + if catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 + if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 return common_header, last_begin_index return common_header, last_begin_index @@ -658,41 +623,43 @@ def process_input(input_path, output_folder, selection, output_suffix): # 根据选择设置对应的模式和结束模式 if selection == 1: - begin_pattern = re.compile( - r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE) - end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) + begin_pattern = regex.compile( + r'.*(?