1.6 insert_del_pagemark.py修改

2025-01-06 11:55:34 +08:00 · 2025-01-06 11:55:34 +08:00 · 691883cc99
commit 691883cc99
parent 04816cdedf
1 changed file with 3 additions and 3 deletions
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -34,7 +34,7 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                    end_page = i
                    break
            else:
-                if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern,cleaned_text):
+                if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
                    end_page = i
                    break
    return start_page, end_page
@@ -99,7 +99,7 @@ def get_patterns_for_procurement():
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*(?!.*说明)'  # 匹配“第X章”或“第X部分”
        r'[\u4e00-\u9fff、()（）]*?'  # 匹配允许的字符
        r'(?:(?:服务|项目|商务|技术|供货)[\u4e00-\u9fff、()（）]*?要求[\u4e00-\u9fff、()（）]*?\s*$|'  # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
-        r'(?:采购.*?(?:内容|要求|需求)?|招标(?:内容|要求|需求))[\u4e00-\u9fff、()（）]*?|'  
+        r'(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求))[\u4e00-\u9fff、()（）]*?|'  
        r'需求书[\u4e00-\u9fff、()（）]*?)\s*$',
        regex.MULTILINE
    )
@@ -525,7 +525,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                elif selection == 5:
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)?|招标(?:内容|要求|需求)).*|'
+                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
                    )
                    end_pattern = regex.compile(