diff --git a/flask_app/general/post_processing.py b/flask_app/general/post_processing.py
index 6f8c632..f6cfee3 100644
--- a/flask_app/general/post_processing.py
+++ b/flask_app/general/post_processing.py
@@ -245,8 +245,8 @@ def outer_post_processing(combined_data, includes,good_list):
     extracted_info = inner_post_processing(base_info)
     # Keep '基础信息' in the processed data
     processed_data["基础信息"] = base_info
-    # Extract '技术要求' under '采购要求'
-    tech_requirements = get_nested(base_info, ["采购要求", "技术要求"], {})
+    # Extract '采购需求' under '采购要求'
+    tech_requirements = get_nested(base_info, ["采购要求", "采购需求"], {})
     if tech_requirements:
         procurement_reqs = extract_matching_keys(tech_requirements,good_list)
     else:
diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py
index c19f3c5..b200d6b 100644
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -96,7 +96,7 @@ def extract_text_by_page(file_path):
 
 if __name__ == '__main__':
     # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile_procurement.pdf'
+    file_path ="C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
     # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py
index 7140be3..fec4c47 100644
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -75,8 +75,6 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
         return [""]  # return an empty string
 
-
-
 # By default, once start_page is matched it is not set again. The table of contents is normally skipped because begin_page=5, but when matching '第一章 招标公告' start_page may still wrongly hit the table of contents.
 def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
                           exclusion_pattern=None, output_suffix="normal"):
@@ -180,23 +178,6 @@ def get_patterns_for_evaluation_method():
     end_pattern = re.compile(
         r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
     return begin_pattern, end_pattern
-
-def get_patterns_for_qualification():
-    # begin_pattern = re.compile(
-    #     r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$',
-    #     re.MULTILINE
-    # )
-    begin_pattern = re.compile(
-        r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查表|符合性审查表)',  # can now match e.g. 附录3, since a bare '附录' is allowed to match
-        re.MULTILINE
-    )
-    end_pattern = re.compile(
-        r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|'
-        # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
-        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',  # more lenient match
-        re.MULTILINE
-    )
-    return begin_pattern, end_pattern
 def get_patterns_for_notice():
     begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
     end_pattern = re.compile(
@@ -489,31 +470,61 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
     return start_page1, end_page1,end_page2
 
-def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header):
+
+import re
+
+
+def extract_pages_qualification(pdf_document, begin_page, common_header):
+    begin_pattern = re.compile(
+        r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
+        re.MULTILINE
+    )
+    priority_pattern = re.compile(
+        r'^(资格性检查|资格审查|符合性审查)',
+        re.MULTILINE
+    )
+    end_pattern = re.compile(
+        r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|'  # the attachment heading and the 符合性审查 text may sit on different lines, so the end position can be misjudged
+        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
+        re.MULTILINE
+    )
+    print("第二次尝试:匹配附件")
     start_page = None
     end_page = None
-    include_keywords = ["资格审查", "资质审查", "符合性审查"]
+    include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
     exclude_keywords = ["声明函", "承诺函"]
+    # Check pages from the position after the chapter starts
     for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
         text = page.extract_text()
         if text:
             cleaned_text = clean_page_content(text, common_header)
-            # Determine the start page; it must come after last_begin_index
-            if (
-                    any(keyword in cleaned_text for keyword in include_keywords) and
-                    all(keyword not in cleaned_text for keyword in exclude_keywords) and
-                    start_page is None
-            ):
-                if begin_pattern.search(cleaned_text):
-                    print("第二次尝试:匹配附件")
-                    start_page = i
+
+            # Check the priority pattern first
+            priority_match = priority_pattern.search(cleaned_text)
+            if priority_match and start_page is None:
+                start_page = i
+                # print(f"匹配到优先模式,设置起始页为: {start_page}")
+            else:
+                # If the priority pattern did not match, fall back to the general pattern
+                if (
+                        any(keyword in cleaned_text for keyword in include_keywords) and
+                        all(keyword not in cleaned_text for keyword in exclude_keywords) and
+                        start_page is None
+                ):
+                    if begin_pattern.search(cleaned_text):
+                        start_page = i
+                        print(f"匹配到附录等模式,设置起始页为: {start_page}")
+
             # Determine the end page
             if start_page is not None and end_pattern.search(cleaned_text):
                 if i > start_page:
                     end_page = i
+                    # print(f"找到结束页: {end_page}")
                     break  # exit the loop once the end page is found
-    return start_page,end_page
+
+    return start_page, end_page
+
 def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
     try:
@@ -527,17 +538,13 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
             patterns = [get_patterns_for_procurement()]
         elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
            patterns = [get_patterns_for_evaluation_method()]
-        elif output_suffix == "qualification1":
-            patterns = [get_patterns_for_qualification()]
         elif output_suffix == "notice":
             patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
-
+        elif output_suffix == "qualification1":
+            start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
         if patterns:
             for pattern_pair in patterns:
-                if output_suffix=="qualification1":
-                    start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
-                else:
-                    start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
+                start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
                                                                common_header,exclusion_pattern, output_suffix)
                 if start_page is not None and end_page is not None:
                     break
@@ -803,17 +810,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
 
 # ztbfile.pdf is missing the qualification review section; the 包头 file is missing the compliance review section
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1).pdf"
-    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
+    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
+    input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
     # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
     # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
     # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
     # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
-    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4"
+    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3"
     # files = truncate_pdf_multiple(input_path, output_folder)
     # selections = [1,4]
     # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
     # print(files)
-    selection = 4  # e.g. 1 - notice, 2 - evaluation method, 3 - qualification review (suffix qualification1, or qualification2 which reuses the evaluation-method patterns), 4 - instructions to bidders (front table part1, body part2), 5 - procurement requirements
+    selection = 3  # e.g. 1 - notice, 2 - evaluation method, 3 - qualification review (suffix qualification1, or qualification2 which reuses the evaluation-method patterns), 4 - instructions to bidders (front table part1, body part2), 5 - procurement requirements
     generated_files = truncate_pdf_main(input_path, output_folder, selection)
     print(generated_files)
\ No newline at end of file
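# --- Illustrative sketch, not part of the diff above ---
# A minimal, standalone rendering of the start-page decision order introduced by the new
# extract_pages_qualification: the priority pattern (explicit qualification / compliance
# review headings) is tried first, and only when it misses does the generic
# appendix/attachment pattern plus keyword filtering apply. The regexes and keyword lists
# are copied from the diff; the sample page texts, the helper name is_qualification_start,
# and the omission of clean_page_content / pdfplumber page objects are illustrative
# assumptions, not part of the actual change.
import re

priority_pattern = re.compile(r'^(资格性检查|资格审查|符合性审查)', re.MULTILINE)
begin_pattern = re.compile(r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)', re.MULTILINE)
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
exclude_keywords = ["声明函", "承诺函"]

def is_qualification_start(cleaned_text: str) -> bool:
    # An explicit review-table heading wins outright.
    if priority_pattern.search(cleaned_text):
        return True
    # Otherwise require a qualification keyword, no excluded keyword,
    # and an appendix/attachment-style heading at the start of a line.
    if (any(k in cleaned_text for k in include_keywords)
            and all(k not in cleaned_text for k in exclude_keywords)):
        return bool(begin_pattern.search(cleaned_text))
    return False

print(is_qualification_start("资格性检查\n一、资格审查标准……"))   # True: priority heading
print(is_qualification_start("附件1:资格审查表\n……"))            # True: appendix heading + keyword
print(is_qualification_start("附件2:投标承诺函\n……"))            # False: excluded keyword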