11.13 无效投标

2024-11-13 17:24:17 +08:00 · 2024-11-13 17:24:17 +08:00 · 6e0b2e4a68
commit 6e0b2e4a68
parent b61d240957
6 changed files with 80 additions and 52 deletions
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@ -33,6 +33,9 @@ def handle_content_append(current_content, line_content, append_newline, keyword
        current_content.append('\n')
    return append_newline

+
+import re
+
 def parse_text_by_heading(text):
    keywords = ['包含', '以下']
    data = {}
@ -44,10 +47,16 @@ def parse_text_by_heading(text):
    skip_subheadings = False
    last_main_number = None
    temp_title = None  # 临时存储以点号开头但不带数字的标题
-    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')   #一、
-    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*') #（一）
+
+    # 定义所有需要的正则表达式模式
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')   # 一、
+    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*') # （一）
+    pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$')  # 初始扫描时匹配 A内容 或 A. 内容
+    pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$')  # 主循环中严格匹配 A. 内容
+    pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')    # 1、内容
+
    initial_heading_pattern = None
-    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括：']  # 定义特殊章节关键词
+    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括：','雷同认定']  # 定义特殊章节关键词
    in_special_section = False  # 标志是否在特殊章节中
    lines = text.split('\n')

@ -69,6 +78,18 @@ def parse_text_by_heading(text):
                return True
        return False

+    # 预先扫描前5行，检查是否有匹配任何标题模式
+    first_five_lines = lines[:5]
+    has_initial_heading_patterns = False
+    for line in first_five_lines:
+        line_stripped = line.strip().replace('．', '.')
+        if (pattern_numbered.match(line_stripped) or
+            pattern_parentheses.match(line_stripped) or
+            pattern_letter_initial.match(line_stripped) or
+            pattern_arabic.match(line_stripped)):
+            has_initial_heading_patterns = True
+            break
+
    for i, line in enumerate(lines):
        line_stripped = line.strip().replace('．', '.')

@ -97,12 +118,11 @@ def parse_text_by_heading(text):
        dot_text_match = re.match(r'^[．.]\s*(\D.+)$', line_stripped)

        # 匹配不带点号的纯数字开头的情况，例如 '27xxxxx'
-        # pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
-        pure_number_match = re.match(r'^(\d+)([^.\d)(）].*)', line_stripped)  #不允许出现右括号
+        pure_number_match = re.match(r'^(\d+)([^.\d)(）].*)', line_stripped)  # 不允许出现右括号

        if match:
            new_key, line_content = match.groups()
-            line_content = line_content.lstrip('.．、,')
+            line_content = line_content.lstrip('.．、,').replace(' ', '')

            if any(keyword in line_content for keyword in keywords):
                skip_subheadings = True
@ -127,13 +147,13 @@ def parse_text_by_heading(text):
                    len(current_content) == 0 or current_content[-1][-1] != '第')):
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
-                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key
                current_content = [line_content]
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_match:
            # 处理以点号开头并带有数字的情况
@ -143,12 +163,12 @@ def parse_text_by_heading(text):
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
-                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_text_match:
            # 处理以点号开头但不带数字的情况，存储到临时变量
@ -159,7 +179,7 @@ def parse_text_by_heading(text):
            # 处理不带点号的纯数字开头的情况
            new_key_candidate, line_content = pure_number_match.groups()
            new_key_candidate += '.'  # 添加点号
-            line_content = line_content.lstrip('.．、,')
+            line_content = line_content.lstrip('.．、,').replace(' ', '')

            # 提取当前键名和新键名的数值部分
            def extract_number(key):
@ -179,29 +199,30 @@ def parse_text_by_heading(text):
                # 开始新的标题
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
-                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key_candidate
                current_content = [line_content]
                append_newline = True
                last_main_number = new_key_candidate.rstrip('.')
            else:
                # 将当前行视为当前标题的内容
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        else:
-            if not skip_subheadings and not in_special_section:  # 增加 in_special_section 判断
+            # 根据预先设置的标志决定是否执行这部分代码
+            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
                numbered_match = pattern_numbered.match(line_stripped)
-                parenthesis_match = pattern_parentheses.match(line_stripped)
-                letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
-                arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
+                parentheses_match = pattern_parentheses.match(line_stripped)
+                letter_match = pattern_letter.match(line_stripped)
+                arabic_match = pattern_arabic.match(line_stripped)

                # 判断当前行是否匹配了任何标题模式
-                if numbered_match or parenthesis_match or letter_match or arabic_match:
+                if numbered_match or parentheses_match or letter_match or arabic_match:
                    # 如果初始标题模式尚未设置，则记录当前匹配的标题模式
                    if initial_heading_pattern is None:
                        if numbered_match:
                            initial_heading_pattern = 'numbered'
-                        elif parenthesis_match:
+                        elif parentheses_match:
                            initial_heading_pattern = 'parentheses'
                        elif letter_match:
                            initial_heading_pattern = 'letter'
@ -211,7 +232,7 @@ def parse_text_by_heading(text):
                    # 确定当前匹配的标题模式
                    if numbered_match:
                        current_heading_pattern = 'numbered'
-                    elif parenthesis_match:
+                    elif parentheses_match:
                        current_heading_pattern = 'parentheses'
                    elif letter_match:
                        current_heading_pattern = 'letter'
@ -223,7 +244,7 @@ def parse_text_by_heading(text):
                        # 保存之前的 key 的内容
                        if current_key is not None:
                            content_string = ''.join(current_content).strip()
-                            data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                            data[current_key] = data.get(current_key, '') + content_string
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
                            current_key_chinese = None
@ -233,8 +254,8 @@ def parse_text_by_heading(text):
                            current_key_chinese = numbered_match.group(1)
                            current_value_chinese = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
-                            current_key_chinese = parenthesis_match.group(1)
-                            current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('.．、,').replace(' ', '')
+                            current_key_chinese = parentheses_match.group(1)
+                            current_value_chinese = line_stripped[parentheses_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'letter':
                            # 字母标题处理
                            letter_key, letter_value = letter_match.groups()
@ -248,27 +269,27 @@ def parse_text_by_heading(text):
                        elif current_heading_pattern == 'arabic':
                            arabic_key, arabic_value = arabic_match.groups()
                            current_key = arabic_key.replace('、', '.')
-                            current_content = [arabic_value]
+                            current_content = [arabic_value.replace(' ', '')]
                            append_newline = True
                            last_main_number = current_key.rstrip('.')
                        continue
                    else:
                        # 当前标题模式与初始模式不一致，将该行视为内容
                        if line_stripped:
-                            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
                else:
                    # 未匹配到任何标题模式，将该行视为内容
                    if line_stripped:
-                        append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                        append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
            else:
                # 在特殊章节中，所有内容都作为当前标题的内容
                if line_stripped:
-                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

    # 最后保存最后一个 key 对应的内容
    if current_key is not None:
        content_string = ''.join(current_content).strip()
-        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+        data[current_key] = data.get(current_key, '') + content_string
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    if temp_title:
@ -277,6 +298,10 @@ def parse_text_by_heading(text):

    return data

+
+
+
+
 def extract_text_from_pdf(file_path, start_word, end_pattern):
    # 从PDF文件中提取文本
    common_header = extract_common_header(file_path)
@ -308,3 +333,4 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
    # 合并所有页面的文本
    full_text = "\n".join(all_pages_text)
    return full_text
+
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -96,7 +96,7 @@ def extract_text_by_page(file_path):

 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path ="C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
+    file_path =r"C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -591,13 +591,13 @@ if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
+    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
    # files=truncate_pdf_multiple(input_path,output_folder)
    # selections = [4, 1]  # 仅处理 selection 4、1
    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
    # print(files)
-    selection = 3  # 例如：1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
+    selection = 1  # 例如：1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)
    # print("生成的文件:", generated_files)
--- a/flask_app/main/提取json工程标版.py
+++ b/flask_app/main/提取json工程标版.py
@ -12,11 +12,12 @@ def convert_clause_to_json(file_path, output_folder, type=1):
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
-            r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
+            # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
+            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
            r'评标办法前附表|'
            r'附录(?:一)?[：:]|'
            r'附件(?:一)?[：:]|'
-            r'附表(?:一)?[：:])$'
+            r'附表(?:一)?[：:]'
        )
    else:
        # 对于其他类型，使用原始字符串定义 start_word 和 end_pattern
@ -29,7 +30,8 @@ def convert_clause_to_json(file_path, output_folder, type=1):
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)）]?\s*$'
        )
        end_pattern = (
-            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$'
+            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
+            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'
        )

    if file_path.endswith('.pdf'):
@ -91,8 +93,9 @@ def convert_clause_to_json(file_path, output_folder, type=1):
    #     json.dump(processed_data, file, ensure_ascii=False, indent=4)

 if __name__ == "__main__":
-    file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
-    output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
+    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
+    output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -80,7 +80,7 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                          output_suffix="normal"):
    start_page = None
    end_page = None
-    for i, page in enumerate(pdf_document.pages):
+    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if output_suffix == "tobidders_notice":
@ -430,6 +430,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
 #     return path1, path2

 def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
+    output_suffix="tobidders_notice"
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
    )
@ -462,8 +463,8 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
            return start_page1, end_page1,end_page1

    # 如果不包含排除关键词，继续提取第二部分
-    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
-                                         exclusion_pattern)
+    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2-1, common_header,
+                                         exclusion_pattern,output_suffix)

    if end_page2 is None:
        return start_page1, end_page1,end_page1
@ -471,9 +472,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
    return start_page1, end_page1,end_page2


-import re
-
-
 def extract_pages_qualification(pdf_document, begin_page, common_header):
    # 开始匹配模式，仅匹配“附录”、“附件”或“附表”
    begin_pattern = re.compile(
@ -834,16 +832,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
 #ztbfile.pdf少资格评审  包头少符合性评审
 if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
+    input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
-    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3"
+    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4_2"
    # files = truncate_pdf_multiple(input_path, output_folder)
    # selections = [1,4]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    # print(files)
-    selection = 3# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
+    selection = 4# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)
--- a/flask_app/货物标/提取json货物标版.py
+++ b/flask_app/货物标/提取json货物标版.py
@ -128,13 +128,14 @@ def clean_content(content):
    return cleaned_content

 # Returns "" when file_path does not exist on disk (this also covers an empty path).
+#TODO:这里的start_word end_pattern可以优化
 def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        start_word = r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则)'
-        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
+        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*)$'
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
@ -175,10 +176,10 @@ def process_folder(input_folder, output_folder):

 #TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
 #TODO: 投标人须知正文这块，序号可能是乱序的，或许可以删除判断序号大小的逻辑，只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
-
+#TODO:招标文件111_tobidders_notice_part2.pdf   陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zb_tobidders_notice.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp'
    try: