diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index d485944..32ce61c 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -34,7 +34,7 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, end_page = i break else: - if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern,cleaned_text): + if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text): end_page = i break return start_page, end_page @@ -99,7 +99,7 @@ def get_patterns_for_procurement(): r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*(?!.*说明)' # 匹配“第X章”或“第X部分” r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符 r'(?:(?:服务|项目|商务|技术|供货)[\u4e00-\u9fff、()()]*?要求[\u4e00-\u9fff、()()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求” - r'(?:采购.*?(?:内容|要求|需求)?|招标(?:内容|要求|需求))[\u4e00-\u9fff、()()]*?|' + r'(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求))[\u4e00-\u9fff、()()]*?|' r'需求书[\u4e00-\u9fff、()()]*?)\s*$', regex.MULTILINE ) @@ -525,7 +525,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_ elif selection == 5: begin_pattern = regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|' - r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)?|招标(?:内容|要求|需求)).*|' + r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|' r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书' ) end_pattern = regex.compile(